In [39]:
!pip install sentence_transformers



In [40]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
df = pd.read_csv('/content/sample_data/anime.csv')
df = df.reset_index()
df.head(2)

Unnamed: 0,index,Anime-PlanetID,Name,Alternative Name,Rating Score,Number Votes,Tags,Content Warning,Type,Episodes,Finished,Duration,StartYear,EndYear,Season,Studios,Synopsis,Url
0,0,10,The Prince of Tennis,Tennis no Ouji-sama,4.037,10889,"Comedy, Drama, Shounen, Sports, Tennis, Based ...",Unknown,TV,178,True,Unknown,2001,2005,Fall 2001,"Production I.G, Trans Arts","Meet Ryoma Echizen, the cocky prince of tennis...",https://www.anime-planet.com/anime/the-prince-...
1,1,100,Neon Genesis Evangelion,Shinseiki Evangelion,4.248,54463,"Drama, Mecha, Sci Fi, Conspiracy, Kaijuu, Lone...","Emotional Abuse, Explicit Violence, Mature The...",TV,26,True,Unknown,1995,1996,Fall 1995,"GAINAX, Tatsunoko Production","In the future, a devastating event known as Se...",https://www.anime-planet.com/anime/neon-genesi...


In [42]:
df.columns

Index(['index', 'Anime-PlanetID', 'Name', 'Alternative Name', 'Rating Score',
       'Finished', 'Duration', 'StartYear', 'EndYear', 'Season', 'Studios',
       'Synopsis', 'Url'],
      dtype='object')

In [43]:
features = ['Rating Score','Number Votes','Studios','Synopsis', 'Tags', 'Episodes']

In [44]:
def combined_features(row):
    return str(row["Rating Score"])+" "+ str(row["Number Votes"])+" "+ str(row["Studios"])+" "+ str(row["Synopsis"])+" "+ str(row["Tags"])+" "+ str(row["Episodes"])+" "

def get_title_from_index(index):
    return df[df["index"] == index]["Name"].values[0]
def get_index_from_title(title):
    return df[df["Name"] == title]["index"].values[0]

In [45]:
df["combined_feature"]=df.apply(combined_features,axis=1)
df["combined_feature"].head()

Unnamed: 0,combined_feature
0,"4.037 10889 Production I.G, Trans Arts Meet Ry..."
1,"4.248 54463 GAINAX, Tatsunoko Production In th..."
2,4.35 23948 Kyoto Animation Half a year has pas...
3,2.8 131 Unknown The idol group 22/7 perform th...
4,1.271 21 Toei Animation No synopsis yet - chec...


In [46]:
cv = CountVectorizer()
count_matrix=cv.fit_transform(df["combined_feature"])

# Naruto

In [47]:
anime_liked = 'Naruto'

## CHATGPT

### BERT + cosine/correlation

In [48]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from sentence_transformers import SentenceTransformer


# Preprocessing
df['Tags'] = df['Tags'].fillna('')
df['Studios'] = df['Studios'].fillna('')
df['Synopsis'] = df['Synopsis'].astype(str).fillna('')

df['Rating Score'] = pd.to_numeric(df['Rating Score'], errors='coerce').fillna(0)
df['Number Votes'] = pd.to_numeric(df['Number Votes'], errors='coerce').fillna(0)

# BERT Encoding (on Synopsis only)
model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = model.encode(df['Synopsis'].tolist(), convert_to_tensor=False)

# Convert to numpy array
embeddings_matrix = np.array(bert_embeddings)

# Get index of input anime
input_index = df[df["Name"] == anime_liked].index[0]
input_vector = embeddings_matrix[input_index]

# Cosine Similarity
cosine_scores = cosine_similarity([input_vector], embeddings_matrix)[0]

# Pearson Correlation Similarity
def compute_pearson(input_vec, all_vecs):
    return [pearsonr(input_vec, vec)[0] if not np.isnan(pearsonr(input_vec, vec)[0]) else 0 for vec in all_vecs]

correlation_scores = compute_pearson(input_vector, embeddings_matrix)

# Add similarities to DataFrame
df['Cosine Similarity'] = cosine_scores
df['Correlation Similarity'] = correlation_scores

# Filter and sort
filtered_df_cosine = df[df['Rating Score'] > 4.0].sort_values(by='Number Votes', ascending=False)
filtered_df_corr = filtered_df_cosine.copy()

# Show top N
recommendations_cosine = filtered_df_cosine.sort_values(by='Cosine Similarity', ascending=False).head(11)
recommendations_corr = filtered_df_corr.sort_values(by='Correlation Similarity', ascending=False).head(11)

print("Top 10 Anime Recommendations (Cosine Similarity):")
print(recommendations_cosine[['Name', 'Cosine Similarity', 'Rating Score', 'Number Votes']])

print("\nTop 10 Anime Recommendations (Correlation Similarity):")
print(recommendations_corr[['Name', 'Correlation Similarity', 'Rating Score', 'Number Votes']])


Top 10 Anime Recommendations (Cosine Similarity):
                                                   Name  Cosine Similarity  \
8371                                             Naruto           1.000000   
13288                          Boruto: Naruto the Movie           0.623281   
1397                                   Naruto Shippuden           0.622964   
11219           Naruto Shippuden Movie 6: Road to Ninja           0.553224   
13381              Haikyuu!! Movie 1: Owari to Hajimari           0.552476   
10430            Naruto Shippuden Movie 5: Blood Prison           0.520740   
12189                              Seitokai Yakuindomo*           0.502166   
14031                                  My Hero Academia           0.490787   
10461                         Full Metal Panic? Fumoffu           0.490092   
9105                                          Slam Dunk           0.487759   
9973   Sekai-ichi Hatsukoi: World's Greatest First Love           0.486885   

       Rating

### Tf-IDF + cosine/correlation

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorization on Synopsis
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1500)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Synopsis']).toarray()

# Input vector
input_index = df[df["Name"] == anime_liked].index[0]
input_vector = tfidf_matrix[input_index]

# Cosine Similarity
cosine_scores = cosine_similarity([input_vector], tfidf_matrix)[0]

# Pearson Correlation
correlation_scores = [pearsonr(input_vector, vec)[0] if not np.isnan(pearsonr(input_vector, vec)[0]) else 0 for vec in tfidf_matrix]

# Add similarities to DataFrame
df['Cosine Similarity'] = cosine_scores
df['Correlation Similarity'] = correlation_scores

# Filter and sort
filtered_df_cosine = df[df['Rating Score'] > 4.0].sort_values(by='Number Votes', ascending=False)
filtered_df_corr = filtered_df_cosine.copy()

# Top results
recommendations_cosine = filtered_df_cosine.sort_values(by='Cosine Similarity', ascending=False).head(11)
recommendations_corr = filtered_df_corr.sort_values(by='Correlation Similarity', ascending=False).head(11)

print("Top 10 Anime Recommendations (TF-IDF + Cosine):")
print(recommendations_cosine[['Name', 'Cosine Similarity', 'Rating Score', 'Number Votes']])

print("\nTop 10 Anime Recommendations (TF-IDF + Correlation):")
print(recommendations_corr[['Name', 'Correlation Similarity', 'Rating Score', 'Number Votes']])


  correlation_scores = [pearsonr(input_vector, vec)[0] if not np.isnan(pearsonr(input_vector, vec)[0]) else 0 for vec in tfidf_matrix]


Top 10 Anime Recommendations (TF-IDF + Cosine):
                                                    Name  Cosine Similarity  \
8371                                              Naruto           1.000000   
13288                           Boruto: Naruto the Movie           0.547438   
1397                                    Naruto Shippuden           0.495983   
11219            Naruto Shippuden Movie 6: Road to Ninja           0.363760   
10430             Naruto Shippuden Movie 5: Blood Prison           0.322569   
12258                               Mushishi: Hihamukage           0.196470   
12886                         The Last: Naruto the Movie           0.186189   
8460                                                Erin           0.180439   
15579  KonoSuba – God’s blessing on this wonderful wo...           0.133759   
14928                                   Lu Over the Wall           0.130722   
8703                                   Princess Mononoke           0.130677   

   

## DEEPSEEK

### BERT

In [50]:
# import pandas as pd
# import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# from sentence_transformers import SentenceTransformer

# # Load dataset
# df = pd.read_csv('anime_data.csv')  # Replace with your dataset path
# df = pd.read_csv('/content/sample_data/anime.csv')
# df = df.reset_index()
# Preprocessing
df['Tags'] = df['Tags'].fillna('')
df['Studios'] = df['Studios'].fillna('')
df['Type'] = df['Type'].fillna('Unknown')
df['Synopsis'] = df['Synopsis'].fillna('')

# Convert and scale 'Number Votes'
df['Number Votes'] = pd.to_numeric(df['Number Votes'], errors='coerce').fillna(0)
scaler = StandardScaler()
scaled_votes = scaler.fit_transform(df[['Number Votes']])

# TF-IDF for Tags
tag_vectorizer = TfidfVectorizer(stop_words='english', max_features=600)
tag_matrix = tag_vectorizer.fit_transform(df['Tags']).toarray()

# One-Hot Encoding for Studios and Type
studio_encoder = OneHotEncoder(handle_unknown='ignore')
studios_encoded = studio_encoder.fit_transform(df[['Studios']]).toarray()
type_encoder = OneHotEncoder(handle_unknown='ignore')
type_encoded = type_encoder.fit_transform(df[['Type']]).toarray()

# BERT Embeddings for Synopsis
model = SentenceTransformer('all-mpnet-base-v2')
synopsis_embeddings = model.encode(df['Synopsis'].tolist(), show_progress_bar=True)

# Combine features
features_combined = np.hstack([
    scaled_votes, tag_matrix, studios_encoded, type_encoded, synopsis_embeddings
])

# Standardize for Pearson correlation
standardized_features = StandardScaler().fit_transform(features_combined)

# Compute similarity matrices
cosine_sim_pearson = cosine_similarity(standardized_features)
cosine_sim = cosine_similarity(features_combined)

# Recommendation function
def get_recommendations(sim_matrix, df, liked_index):
    sim_scores = list(enumerate(sim_matrix[liked_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = [i for i in sim_scores if i[0] != liked_index]
    recommendations = []
    for idx, score in sim_scores:
        if df.iloc[idx]['Rating Score'] > 4.0:
            recommendations.append((idx, score))
    rec_df = df.iloc[[i[0] for i in recommendations]].copy()
    rec_df['Similarity Score'] = [i[1] for i in recommendations]
    rec_df = rec_df.sort_values(by='Number Votes', ascending=False)
    return rec_df[['Name', 'Rating Score', 'Number Votes', 'Similarity Score']]

# Get input anime index
# anime_liked = "Naruto"
liked_index = df[df['Name'] == anime_liked].index[0]

# Generate recommendations
pearson_rec = get_recommendations(cosine_sim_pearson, df, liked_index)
cosine_rec = get_recommendations(cosine_sim, df, liked_index)

print("Pearson-based Recommendations:")
print(pearson_rec.head(10))
print("\nCosine-based Recommendations:")
print(cosine_rec.head(10))

Batches:   0%|          | 0/520 [00:00<?, ?it/s]

Pearson-based Recommendations:
                                       Name  Rating Score  Number Votes  \
502                              Death Note         4.500      153675.0   
11626                       Attack on Titan         4.509      124167.0   
10730                      Sword Art Online         4.138      107210.0   
8548       Fullmetal Alchemist: Brotherhood         4.696      100159.0   
15870                         Spirited Away         4.590       97786.0   
1397                       Naruto Shippuden         4.311       96260.0   
14256                                Bleach         4.130       93436.0   
10622                   Fullmetal Alchemist         4.429       92770.0   
598    Code Geass: Lelouch of the Rebellion         4.549       85617.0   
13507                         One-Punch Man         4.578       85068.0   

       Similarity Score  
502            0.260461  
11626          0.186835  
10730          0.172773  
8548           0.136234  
15870        

### Tf-IDF

In [51]:
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# Load dataset
# df = pd.read_csv('anime_data.csv')  # Replace with your dataset path

# Preprocessing
df['Tags'] = df['Tags'].fillna('')
df['Studios'] = df['Studios'].fillna('')
df['Type'] = df['Type'].fillna('Unknown')
df['Synopsis'] = df['Synopsis'].fillna('')

# Convert and scale 'Number Votes'
df['Number Votes'] = pd.to_numeric(df['Number Votes'], errors='coerce').fillna(0)
scaler = StandardScaler()
scaled_votes = scaler.fit_transform(df[['Number Votes']])

# TF-IDF for Tags and Synopsis
tag_vectorizer = TfidfVectorizer(stop_words='english', max_features=600)
tag_matrix = tag_vectorizer.fit_transform(df['Tags']).toarray()
synopsis_vectorizer = TfidfVectorizer(stop_words='english', max_features=1500)
synopsis_matrix = synopsis_vectorizer.fit_transform(df['Synopsis']).toarray()

# One-Hot Encoding for Studios and Type
studio_encoder = OneHotEncoder(handle_unknown='ignore')
studios_encoded = studio_encoder.fit_transform(df[['Studios']]).toarray()
type_encoder = OneHotEncoder(handle_unknown='ignore')
type_encoded = type_encoder.fit_transform(df[['Type']]).toarray()

# Combine features
features_combined = np.hstack([
    scaled_votes, tag_matrix, synopsis_matrix, studios_encoded, type_encoded
])

# Standardize for Pearson correlation
standardized_features = StandardScaler().fit_transform(features_combined)

# Compute similarity matrices
cosine_sim_pearson = cosine_similarity(standardized_features)
cosine_sim = cosine_similarity(features_combined)

# Reuse the same recommendation function as Code 1
def get_recommendations(sim_matrix, df, liked_index):
    # ... (same as in Code 1)
    sim_scores = list(enumerate(sim_matrix[liked_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = [i for i in sim_scores if i[0] != liked_index]
    recommendations = []
    for idx, score in sim_scores:
        if df.iloc[idx]['Rating Score'] > 4.0:
            recommendations.append((idx, score))
    rec_df = df.iloc[[i[0] for i in recommendations]].copy()
    rec_df['Similarity Score'] = [i[1] for i in recommendations]
    rec_df = rec_df.sort_values(by='Number Votes', ascending=False)
    return rec_df[['Name', 'Rating Score', 'Number Votes', 'Similarity Score']]

# Get input anime index and generate recommendations
# anime_liked = "Naruto"
liked_index = df[df['Name'] == anime_liked].index[0]

pearson_rec = get_recommendations(cosine_sim_pearson, df, liked_index)
cosine_rec = get_recommendations(cosine_sim, df, liked_index)

print("Pearson-based Recommendations:")
print(pearson_rec.head(10))
print("\nCosine-based Recommendations:")
print(cosine_rec.head(10))

Pearson-based Recommendations:
                                       Name  Rating Score  Number Votes  \
502                              Death Note         4.500      153675.0   
11626                       Attack on Titan         4.509      124167.0   
10730                      Sword Art Online         4.138      107210.0   
8548       Fullmetal Alchemist: Brotherhood         4.696      100159.0   
15870                         Spirited Away         4.590       97786.0   
1397                       Naruto Shippuden         4.311       96260.0   
14256                                Bleach         4.130       93436.0   
10622                   Fullmetal Alchemist         4.429       92770.0   
598    Code Geass: Lelouch of the Rebellion         4.549       85617.0   
13507                         One-Punch Man         4.578       85068.0   

       Similarity Score  
502            0.146419  
11626          0.092859  
10730          0.059933  
8548           0.062228  
15870        

# Attack on Titan

In [52]:
anime_liked = 'Attack on Titan'

## CHATGPT

### BERT

In [53]:
# import pandas as pd
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity
# from scipy.stats import pearsonr
# from sentence_transformers import SentenceTransformer


# # Preprocessing
# df['Tags'] = df['Tags'].fillna('')
# df['Studios'] = df['Studios'].fillna('')
# df['Synopsis'] = df['Synopsis'].astype(str).fillna('')

# df['Rating Score'] = pd.to_numeric(df['Rating Score'], errors='coerce').fillna(0)
# df['Number Votes'] = pd.to_numeric(df['Number Votes'], errors='coerce').fillna(0)

# # BERT Encoding (on Synopsis only)
# model = SentenceTransformer('all-MiniLM-L6-v2')
# bert_embeddings = model.encode(df['Synopsis'].tolist(), convert_to_tensor=False)

# # Convert to numpy array
# embeddings_matrix = np.array(bert_embeddings)

# Get index of input anime
input_index = df[df["Name"] == anime_liked].index[0]
input_vector = embeddings_matrix[input_index]

# Cosine Similarity
cosine_scores = cosine_similarity([input_vector], embeddings_matrix)[0]

# Pearson Correlation Similarity
def compute_pearson(input_vec, all_vecs):
    return [pearsonr(input_vec, vec)[0] if not np.isnan(pearsonr(input_vec, vec)[0]) else 0 for vec in all_vecs]

correlation_scores = compute_pearson(input_vector, embeddings_matrix)

# Add similarities to DataFrame
df['Cosine Similarity'] = cosine_scores
df['Correlation Similarity'] = correlation_scores

# Filter and sort
filtered_df_cosine = df[df['Rating Score'] > 4.0].sort_values(by='Number Votes', ascending=False)
filtered_df_corr = filtered_df_cosine.copy()

# Show top N
recommendations_cosine = filtered_df_cosine.sort_values(by='Cosine Similarity', ascending=False).head(11)
recommendations_corr = filtered_df_corr.sort_values(by='Correlation Similarity', ascending=False).head(11)

print("Top 10 Anime Recommendations (Cosine Similarity):")
print(recommendations_cosine[['Name', 'Cosine Similarity', 'Rating Score', 'Number Votes']])

print("\nTop 10 Anime Recommendations (Correlation Similarity):")
print(recommendations_corr[['Name', 'Correlation Similarity', 'Rating Score', 'Number Votes']])


Top 10 Anime Recommendations (Cosine Similarity):
                                      Name  Cosine Similarity  Rating Score  \
11626                      Attack on Titan           1.000000         4.509   
15486           Attack on Titan 3rd Season           0.642848         4.563   
13209           Attack on Titan 2nd Season           0.615868         4.505   
2096   Attack on Titan 3rd Season: Part II           0.484530         4.675   
11864     Attack on Titan: Ilse's Notebook           0.476438         4.174   
2416                            Ergo Proxy           0.473564         4.087   
4030                            Deca-Dence           0.456638         4.017   
7106               Mobile Suit Zeta Gundam           0.455170         4.081   
14471                No Game No Life: Zero           0.429409         4.433   
4034                      Brand New Animal           0.427638         4.177   
2391                             Dr. Stone           0.422884         4.514   

 

### Tf-IDF

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorization on Synopsis
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1500)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Synopsis']).toarray()

# Input vector
input_index = df[df["Name"] == anime_liked].index[0]
input_vector = tfidf_matrix[input_index]

# Cosine Similarity
cosine_scores = cosine_similarity([input_vector], tfidf_matrix)[0]

# Pearson Correlation
correlation_scores = [pearsonr(input_vector, vec)[0] if not np.isnan(pearsonr(input_vector, vec)[0]) else 0 for vec in tfidf_matrix]

# Add similarities to DataFrame
df['Cosine Similarity'] = cosine_scores
df['Correlation Similarity'] = correlation_scores

# Filter and sort
filtered_df_cosine = df[df['Rating Score'] > 4.0].sort_values(by='Number Votes', ascending=False)
filtered_df_corr = filtered_df_cosine.copy()

# Top results
recommendations_cosine = filtered_df_cosine.sort_values(by='Cosine Similarity', ascending=False).head(11)
recommendations_corr = filtered_df_corr.sort_values(by='Correlation Similarity', ascending=False).head(11)

print("Top 10 Anime Recommendations (TF-IDF + Cosine):")
print(recommendations_cosine[['Name', 'Cosine Similarity', 'Rating Score', 'Number Votes']])

print("\nTop 10 Anime Recommendations (TF-IDF + Correlation):")
print(recommendations_corr[['Name', 'Correlation Similarity', 'Rating Score', 'Number Votes']])


  correlation_scores = [pearsonr(input_vector, vec)[0] if not np.isnan(pearsonr(input_vector, vec)[0]) else 0 for vec in tfidf_matrix]


Top 10 Anime Recommendations (TF-IDF + Cosine):
                                                    Name  Cosine Similarity  \
11626                                    Attack on Titan           1.000000   
2416                                          Ergo Proxy           0.195956   
5936      Tsubasa Reservoir Chronicle: Tokyo Revelations           0.191135   
12851                         Blood Blockade Battlefront           0.179152   
4034                                    Brand New Animal           0.174617   
7952                                Aria the Origination           0.167737   
13055                                  Seraph of the End           0.146888   
11435  A Certain Magical Index Movie: The Miracle of ...           0.144987   
12464                                      Durarara!! X2           0.141512   
10001                                              No. 6           0.139904   
15511                              DARLING in the FRANXX           0.134907   

   

## DEEPSEEK

### BERT

In [55]:
# TF-IDF for Tags
tag_vectorizer = TfidfVectorizer(stop_words='english', max_features=600)
tag_matrix = tag_vectorizer.fit_transform(df['Tags']).toarray()

# One-Hot Encoding for Studios and Type
studio_encoder = OneHotEncoder(handle_unknown='ignore')
studios_encoded = studio_encoder.fit_transform(df[['Studios']]).toarray()
type_encoder = OneHotEncoder(handle_unknown='ignore')
type_encoded = type_encoder.fit_transform(df[['Type']]).toarray()

# BERT Embeddings for Synopsis
# model = SentenceTransformer('all-mpnet-base-v2')
# synopsis_embeddings = model.encode(df['Synopsis'].tolist(), show_progress_bar=True)

# Combine features
features_combined = np.hstack([
    scaled_votes, tag_matrix, studios_encoded, type_encoded, synopsis_embeddings
])

# Standardize for Pearson correlation
standardized_features = StandardScaler().fit_transform(features_combined)

# Compute similarity matrices
cosine_sim_pearson = cosine_similarity(standardized_features)
cosine_sim = cosine_similarity(features_combined)

# Recommendation function
def get_recommendations(sim_matrix, df, liked_index):
    sim_scores = list(enumerate(sim_matrix[liked_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = [i for i in sim_scores if i[0] != liked_index]
    recommendations = []
    for idx, score in sim_scores:
        if df.iloc[idx]['Rating Score'] > 4.0:
            recommendations.append((idx, score))
    rec_df = df.iloc[[i[0] for i in recommendations]].copy()
    rec_df['Similarity Score'] = [i[1] for i in recommendations]
    rec_df = rec_df.sort_values(by='Number Votes', ascending=False)
    return rec_df[['Name', 'Rating Score', 'Number Votes', 'Similarity Score']]

# Get input anime index
anime_liked = "Attack on Titan"
liked_index = df[df['Name'] == anime_liked].index[0]

# Generate recommendations
pearson_rec = get_recommendations(cosine_sim_pearson, df, liked_index)
cosine_rec = get_recommendations(cosine_sim, df, liked_index)

print("Pearson-based Recommendations:")
print(pearson_rec.head(11))
print("\nCosine-based Recommendations:")
print(cosine_rec.head(11))

Pearson-based Recommendations:
                                       Name  Rating Score  Number Votes  \
502                              Death Note         4.500      153675.0   
8371                                 Naruto         4.134      117389.0   
10730                      Sword Art Online         4.138      107210.0   
8548       Fullmetal Alchemist: Brotherhood         4.696      100159.0   
15870                         Spirited Away         4.590       97786.0   
1397                       Naruto Shippuden         4.311       96260.0   
14256                                Bleach         4.130       93436.0   
10622                   Fullmetal Alchemist         4.429       92770.0   
598    Code Geass: Lelouch of the Rebellion         4.549       85617.0   
13507                         One-Punch Man         4.578       85068.0   
14031                      My Hero Academia         4.467       82455.0   

       Similarity Score  
502            0.217792  
8371           0

### Tf-IDF

In [57]:

# # TF-IDF for Tags and Synopsis
tag_vectorizer = TfidfVectorizer(stop_words='english', max_features=600)
tag_matrix = tag_vectorizer.fit_transform(df['Tags']).toarray()
synopsis_vectorizer = TfidfVectorizer(stop_words='english', max_features=1500)
synopsis_matrix = synopsis_vectorizer.fit_transform(df['Synopsis']).toarray()

# # One-Hot Encoding for Studios and Type
studio_encoder = OneHotEncoder(handle_unknown='ignore')
studios_encoded = studio_encoder.fit_transform(df[['Studios']]).toarray()
type_encoder = OneHotEncoder(handle_unknown='ignore')
type_encoded = type_encoder.fit_transform(df[['Type']]).toarray()

# # Combine features
features_combined = np.hstack([
    scaled_votes, tag_matrix, synopsis_matrix, studios_encoded, type_encoded
])

# # Standardize for Pearson correlation
standardized_features = StandardScaler().fit_transform(features_combined)

# # Compute similarity matrices
cosine_sim_pearson = cosine_similarity(standardized_features)
cosine_sim = cosine_similarity(features_combined)

# # Reuse the same recommendation function as Code 1
# def get_recommendations(sim_matrix, df, liked_index):
#     # ... (same as in Code 1)
#     sim_scores = list(enumerate(sim_matrix[liked_index]))
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#     sim_scores = [i for i in sim_scores if i[0] != liked_index]
#     recommendations = []
#     for idx, score in sim_scores:
#         if df.iloc[idx]['Rating Score'] > 4.0:
#             recommendations.append((idx, score))
#     rec_df = df.iloc[[i[0] for i in recommendations]].copy()
#     rec_df['Similarity Score'] = [i[1] for i in recommendations]
#     rec_df = rec_df.sort_values(by='Number Votes', ascending=False)
#     return rec_df[['Name', 'Rating Score', 'Number Votes', 'Similarity Score']]

# Get input anime index and generate recommendations
# anime_liked = "Attack on "
liked_index = df[df['Name'] == anime_liked].index[0]

pearson_rec = get_recommendations(cosine_sim_pearson, df, liked_index)
cosine_rec = get_recommendations(cosine_sim, df, liked_index)
print(anime_liked)
print("Pearson-based Recommendations:")
print(pearson_rec.head(11))
print("\nCosine-based Recommendations:")
print(cosine_rec.head(11))

Attack on Titan
Pearson-based Recommendations:
                                       Name  Rating Score  Number Votes  \
502                              Death Note         4.500      153675.0   
8371                                 Naruto         4.134      117389.0   
10730                      Sword Art Online         4.138      107210.0   
8548       Fullmetal Alchemist: Brotherhood         4.696      100159.0   
15870                         Spirited Away         4.590       97786.0   
1397                       Naruto Shippuden         4.311       96260.0   
14256                                Bleach         4.130       93436.0   
10622                   Fullmetal Alchemist         4.429       92770.0   
598    Code Geass: Lelouch of the Rebellion         4.549       85617.0   
13507                         One-Punch Man         4.578       85068.0   
14031                      My Hero Academia         4.467       82455.0   

       Similarity Score  
502            0.087681  


# Perplexity

In [56]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MultiLabelBinarizer  # NEW: For tag encoding

# Preprocessing
df['Tags'] = df['Tags'].fillna('')
df['Studios'] = df['Studios'].fillna('')
df['Synopsis'] = df['Synopsis'].astype(str).fillna('')

# CHANGED: Convert tags to list and encode
df['Tags'] = df['Tags'].apply(lambda x: x.split(', ') if x else [])
mlb = MultiLabelBinarizer()
tag_encoded = mlb.fit_transform(df['Tags'])

# BERT Encoding (Synopsis)
model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = model.encode(df['Synopsis'].tolist(), convert_to_tensor=False)

# CHANGED: Combine BERT embeddings with tag features
combined_features = np.hstack([bert_embeddings, tag_encoded])

# Get index of input anime
input_index = df[df["Name"] == anime_liked].index[0]
input_vector = combined_features[input_index]

# Calculate similarities
cosine_scores = cosine_similarity([input_vector], combined_features)[0]
correlation_scores = [pearsonr(input_vector, vec)[0] if not np.isnan(pearsonr(input_vector, vec)[0]) else 0
                      for vec in combined_features]

# Add to DataFrame
df['Cosine Similarity'] = cosine_scores
df['Correlation Similarity'] = correlation_scores

# CHANGED: Filter first then sort by similarity
filtered_df = df[df['Rating Score'] > 4.0]
recommendations_cosine = filtered_df.sort_values(by='Cosine Similarity', ascending=False).head(10)
recommendations_corr = filtered_df.sort_values(by='Correlation Similarity', ascending=False).head(10)

print("Top 10 Anime Recommendations (Cosine Similarity):")
print(recommendations_cosine[['Name', 'Cosine Similarity', 'Rating Score']])

print("\nTop 10 Anime Recommendations (Correlation Similarity):")
print(recommendations_corr[['Name', 'Correlation Similarity', 'Rating Score']])
