In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Load the anime catalogue into `df`, the DataFrame the later cells read and extend.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell only
# runs on a machine where that exact directory exists — consider a path relative to
# the notebook instead.
df = pd.read_csv('C:\\Users\\acer\\Desktop\\all\\Flutter\\anime_rec\\backend\\dataset\\anime.csv')
# reset_index() with default arguments copies the current row index into a new
# column named "index"; get_title_from_index / get_index_from_title look rows up
# through that column.
df = df.reset_index()
# Preview the first two rows.
df.head(2)

Unnamed: 0,index,Anime-PlanetID,Name,Alternative Name,Rating Score,Number Votes,Tags,Content Warning,Type,Episodes,Finished,Duration,StartYear,EndYear,Season,Studios,Synopsis,Url
0,0,10,The Prince of Tennis,Tennis no Ouji-sama,4.037,10889,"Comedy, Drama, Shounen, Sports, Tennis, Based ...",Unknown,TV,178,True,Unknown,2001,2005,Fall 2001,"Production I.G, Trans Arts","Meet Ryoma Echizen, the cocky prince of tennis...",https://www.anime-planet.com/anime/the-prince-...
1,1,100,Neon Genesis Evangelion,Shinseiki Evangelion,4.248,54463,"Drama, Mecha, Sci Fi, Conspiracy, Kaijuu, Lone...","Emotional Abuse, Explicit Violence, Mature The...",TV,26,True,Unknown,1995,1996,Fall 1995,"GAINAX, Tatsunoko Production","In the future, a devastating event known as Se...",https://www.anime-planet.com/anime/neon-genesi...


In [None]:
# Display the column labels of `df` (this includes the "index" column created by
# reset_index() in the load cell).
df.columns

In [None]:
# Columns that feed the bag-of-words text blob.
# NOTE(review): nothing visible in this notebook reads this list —
# combined_features() names the same six columns itself.
features = ['Rating Score','Number Votes','Studios','Synopsis', 'Tags', 'Episodes']

In [None]:
def combined_features(row, columns=("Rating Score", "Number Votes", "Studios",
                                    "Synopsis", "Tags", "Episodes")):
    """Join the selected fields of one row into a single space-separated string.

    Args:
        row: Mapping-like row indexable by column label, e.g. the Series that
            ``DataFrame.apply(..., axis=1)`` passes in.
        columns: Labels to concatenate, in order. Defaults to the six columns
            this function has always used, so existing one-argument calls are
            unaffected.

    Returns:
        str: each field's ``str()`` form followed by one space (so the result
        ends with a trailing space, as before).

    Raises:
        KeyError: if ``row`` has no entry for one of ``columns``.
    """
    # Missing values become an empty field. Plain str() would emit the literal
    # tokens "nan" / "None", which a text vectorizer then counts as a real word
    # shared by every row with a gap in the same column.
    fields = ["" if pd.isna(row[column]) else str(row[column]) for column in columns]
    return " ".join(fields) + " "

def get_title_from_index(index):
    """Look up an anime title by its value in the ``index`` column of the global ``df``.

    Returns the ``Name`` of the first matching row; an ``IndexError`` propagates
    when no row matches.
    """
    row_mask = df["index"] == index
    matching_titles = df[row_mask]["Name"].values
    return matching_titles[0]
def get_index_from_title(title):
    """Look up the ``index`` column value for an exact title in the global ``df``.

    Returns the value from the first row whose ``Name`` equals ``title``; an
    ``IndexError`` propagates when no row matches.
    """
    title_mask = df["Name"] == title
    matching_indices = df[title_mask]["index"].values
    return matching_indices[0]

In [None]:
# Run combined_features over every row (axis=1) and store the resulting text blob
# in a new "combined_feature" column on `df`.
df["combined_feature"]=df.apply(combined_features,axis=1)
# Show the first few blobs as a sanity check.
df["combined_feature"].head()

In [None]:
# Turn each combined text blob into a row of token counts (bag of words).
# NOTE(review): no visible cell reads `count_matrix` afterwards; the similarity
# matrices built further down come from a different feature set.
cv = CountVectorizer()
count_matrix=cv.fit_transform(df["combined_feature"])

In [None]:
# Title to recommend from; it is compared for exact equality against the "Name"
# column when the recommendations are generated.
anime_liked = 'Naruto'

In [None]:
import pickle
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# --- Clean-up -----------------------------------------------------------------
# Replace missing text with '' (and missing Type with 'Unknown') so the
# vectorizer / encoders below only ever see strings.
df['Tags'] = df['Tags'].fillna('')
df['Studios'] = df['Studios'].fillna('')
df['Type'] = df['Type'].fillna('Unknown')
df['Synopsis'] = df['Synopsis'].fillna('')
# Non-numeric ratings become NaN through errors='coerce' and are then set to 0,
# which puts them below the 4.0 rating cut-off used by get_recommendations.
# Note that 'Rating Score' is not one of the blocks stacked into
# `features_combined` below.
df['Rating Score'] = pd.to_numeric(df['Rating Score'], errors='coerce').fillna(0)  # Fix rating scores
# Convert 'Number Votes' to numeric (unparseable -> 0) and standardize it.
# The double brackets keep a 2-D single-column frame for the scaler.
df['Number Votes'] = pd.to_numeric(df['Number Votes'], errors='coerce').fillna(0)
scaler = StandardScaler()
scaled_votes = scaler.fit_transform(df[['Number Votes']])

# TF-IDF for Tags, capped at 600 vocabulary terms; .toarray() makes it dense so
# it can be stacked with the other blocks by np.hstack.
tag_vectorizer = TfidfVectorizer(stop_words='english', max_features=600)
tag_matrix = tag_vectorizer.fit_transform(df['Tags']).toarray()

# One-Hot Encoding for Studios and Type.
# NOTE(review): the preview rows show comma-separated studio lists (e.g.
# "Production I.G, Trans Arts"); encoding the raw column makes every distinct
# full string its own category, so titles sharing only one studio do not share
# a column — confirm that is intended.
studio_encoder = OneHotEncoder(handle_unknown='ignore')
studios_encoded = studio_encoder.fit_transform(df[['Studios']]).toarray()
type_encoder = OneHotEncoder(handle_unknown='ignore')
type_encoded = type_encoder.fit_transform(df[['Type']]).toarray()

# Sentence embeddings for Synopsis from the 'all-mpnet-base-v2'
# SentenceTransformer checkpoint (an MPNet model, per its name, rather than BERT).
# Every synopsis is encoded here, so this is presumably the slow step of the cell.
model = SentenceTransformer('all-mpnet-base-v2')
synopsis_embeddings = model.encode(df['Synopsis'].tolist(), show_progress_bar=True)

# Combine features: concatenate all blocks column-wise into one row per title.
features_combined = np.hstack([
    scaled_votes, tag_matrix, studios_encoded, type_encoded, synopsis_embeddings
])

# Standardize every feature column (zero mean / unit variance across titles).
# NOTE(review): this was labelled "for Pearson correlation", but cosine
# similarity of column-standardized rows is not the Pearson correlation between
# two titles' feature vectors — that would require centering each ROW on its own
# mean (e.g. np.corrcoef). Confirm which measure was intended.
standardized_features = StandardScaler().fit_transform(features_combined)

# Compute similarity matrices: each is a dense (n_titles x n_titles) array.
cosine_sim_pearson = cosine_similarity(standardized_features)
cosine_sim = cosine_similarity(features_combined)


def get_recommendations(sim_matrix, df, liked_index, min_rating=4.0):
    """Rank the titles most similar to one title, keeping only well-rated ones.

    Args:
        sim_matrix: Square similarity matrix whose row/column ``i`` corresponds
            to row position ``i`` of ``df``.
        df: Catalogue with at least the columns ``Name``, ``Rating Score`` and
            ``Number Votes``.
        liked_index: Row *position* (not index label) of the liked title.
        min_rating: Only titles with ``Rating Score`` strictly greater than
            this value are returned. Defaults to 4.0, the previously
            hard-coded threshold.

    Returns:
        DataFrame with columns ``Name``, ``Rating Score``, ``Number Votes`` and
        ``Similarity Score``, sorted by descending similarity. The liked title
        itself is never included. Rows keep their original index labels.
    """
    scores = np.asarray(sim_matrix[liked_index])

    # Vectorized rating filter; copy=True guarantees a writable mask even when
    # pandas would otherwise hand back a read-only view.
    keep = (df['Rating Score'] > min_rating).to_numpy(dtype=bool, copy=True)
    keep[liked_index] = False  # never recommend the title itself

    candidates = np.flatnonzero(keep)
    # A stable sort on the negated scores gives descending order while leaving
    # equally-scored titles in catalogue order.
    ranked = candidates[np.argsort(-scores[candidates], kind='stable')]

    rec_df = df.iloc[ranked].copy()
    rec_df['Similarity Score'] = scores[ranked]
    return rec_df[['Name', 'Rating Score', 'Number Votes', 'Similarity Score']]


import os

# Everything a consumer needs to serve recommendations without refitting:
# the cleaned catalogue, the fitted transformers, the stacked feature matrix and
# both precomputed similarity matrices.
anime_recommender_model = {
    'df': df,
    'scaler': scaler,
    'tag_vectorizer': tag_vectorizer,
    'studio_encoder': studio_encoder,
    'type_encoder': type_encoder,
    'features_combined': features_combined,
    'cosine_sim': cosine_sim,
    'cosine_sim_pearson': cosine_sim_pearson,
    # NOTE(review): pickle stores a function by its qualified name, not by value.
    # Loading this file therefore needs a `get_recommendations` reachable under
    # the same module name it had here (the notebook's `__main__`), otherwise
    # unpickling fails — confirm the consumer provides it.
    'get_recommendations': get_recommendations
}

# Build the path with os.path.join so the separator matches the host OS. The
# previous hard-coded '..\\model\\...' literal resolves to the same path on
# Windows, but on POSIX systems it names a single file containing backslashes.
with open(os.path.join('..', 'model', 'anime_recommender_model.pkl'), 'wb') as f:
    pickle.dump(anime_recommender_model, f)

# Resolve the liked title to a row POSITION: the similarity matrix and the
# df.iloc lookups inside get_recommendations are positional, whereas `.index[0]`
# would return an index label that only coincides with the position while `df`
# has a default 0..n-1 index.
name_matches = np.flatnonzero((df['Name'] == anime_liked).to_numpy())
if name_matches.size == 0:
    # Same exception type the bare `.index[0]` lookup raised, with a usable message.
    raise IndexError(f"No anime named {anime_liked!r} found in the 'Name' column")
liked_index = int(name_matches[0])


# pearson_rec = get_recommendations(cosine_sim_pearson, df, liked_index)
cosine_rec = get_recommendations(cosine_sim, df, liked_index)

# print("Pearson-based Recommendations:")
# print(pearson_rec.head(10))
print("\nCosine-based Recommendations:")
print(cosine_rec.head(10))

In [None]:
# ---------------------------------------------------------------------------
# PSEUDOCODE OUTLINE of the recommender pipeline — not executable.
#
# Kept entirely as comments so this cell is a no-op under "Run All". As code it
# began with `import necessary_modules()`, which is a SyntaxError, and the
# helper names used below are not defined in any visible cell of this notebook.
# ---------------------------------------------------------------------------
#
# Import libraries
#     import necessary_modules()
#
# Load and preprocess data
#     load_data()
#     fill_missing_values()
#     convert_and_scale(['Rating Score', 'Number Votes'])
#
# Feature extraction
#     tags_tfidf = tfidf_vectorize('Tags')
#     studios_encoded = one_hot_encode('Studios')
#     types_encoded = one_hot_encode('Type')
#     synopsis_embed = bert_encode('Synopsis')
#
# Combine features and compute similarity
#     features = combine([scaled_votes, tags_tfidf, studios_encoded, types_encoded, synopsis_embed])
#     cosine_sim = compute_cosine_similarity(features)
#
# Recommendation function
#     def get_recommendations(sim_matrix, df, liked_index):
#         scores = get_sorted_similarities(sim_matrix[liked_index])
#         filtered = filter_by_rating(df, scores, min_rating=4.0)
#         return top_recommendations(filtered)
#
# Generate and display recommendations
#     liked_index = find_index('liked_anime')
#     recs = get_recommendations(cosine_sim, df, liked_index)
#     print(recs.head(10))
