In [28]:
import pandas as pd
import numpy as np 
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
from sentence_transformers import SentenceTransformer
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from rank_bm25 import BM25Okapi #better version of tfidf
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Load the raw IMDb anime export. on_bad_lines='skip' drops rows that cannot be
# parsed instead of raising, so the loaded row count may be lower than the file's.
anime_data = pd.read_csv(
    r'D:\NYUMSDS_courses\Projects\ConsumerReviewData\imdb_anime.csv',
    on_bad_lines='skip',
)

In [7]:
# Sanity-check the loaded frame: column names and number of rows.
# Only the last expression of a cell is rendered, so this head() call displays nothing.
anime_data.head()
print(anime_data.columns)
print(anime_data.shape[0])

Index(['Title', 'Genre', 'User Rating', 'Number of Votes', 'Runtime', 'Year',
       'Summary', 'Stars', 'Certificate', 'Metascore', 'Gross', 'Episode',
       'Episode Title'],
      dtype='object')
45717


In [8]:
# Keep only the columns the recommender needs. Take an explicit copy so the
# assignments below modify new_df itself rather than a slice of anime_data.
col_select = ['Title', 'Summary']
new_df = anime_data[col_select].copy()

# A missing summary becomes '' so the title alone still forms a document.
# The result is assigned back: in-place fillna on a selected column is not
# guaranteed to write through to the parent frame.
new_df['Summary'] = new_df['Summary'].fillna('')

# Prepend the title so that title words take part in the text similarity.
new_df['Summary'] = new_df['Title'] + '. ' + new_df['Summary'].map(str)

# Drop rows that still contain NaN (i.e. a missing title) and exact
# (Title, Summary) duplicates. The index is then reset so that row labels equal
# row positions, which is what movie_search_term (enumerate) and
# recommend_movie (.iloc / .values) work with.
new_df = new_df.dropna().drop_duplicates().reset_index(drop=True)

In [9]:
# Preview the first five rows of the Title/Summary frame.
new_df.head(5)

Unnamed: 0,Title,Summary
0,One Piece,One Piece. Follows the adventures of Monkey D....
1,Teenage Mutant Ninja Turtles: Mutant Mayhem,Teenage Mutant Ninja Turtles: Mutant Mayhem. T...
2,The Super Mario Bros. Movie,The Super Mario Bros. Movie. A plumber named M...
3,Attack on Titan,Attack on Titan. After his hometown is destroy...
4,Jujutsu Kaisen,Jujutsu Kaisen. A boy swallows a cursed talism...


In [10]:
def movie_search_term(df:pd.DataFrame, title_col:str, term: str):
    """Find the titles that contain ``term`` as a whole, space-separated word.

    Matching is case-insensitive: both the titles and ``term`` are lower-cased
    before comparison.

    Side effect: ``df[title_col]`` is lower-cased in place. Later cells compare
    against the lower-cased titles, so this behavior is kept.

    Args:
        df: frame holding the titles.
        title_col: name of the title column in ``df``.
        term: word to look for.

    Returns:
        A list of ``(position, lower-cased title)`` tuples, one per matching
        row, where ``position`` is the row's 0-based position in ``df``.
    """
    df[title_col] = df[title_col].str.lower()
    needle = term.lower()
    return [
        (position, movie)
        for position, movie in enumerate(df[title_col].values)
        # Membership test (instead of one result per matching word) keeps a
        # title that repeats the term from being listed more than once.
        # Non-string entries such as NaN cannot match and are skipped.
        if isinstance(movie, str) and needle in movie.split(' ')
    ]

In [11]:
# Search the titles for the word "one" to find the position of a movie to query.
# df is the same object as new_df (no copy), so the lower-casing done by
# movie_search_term is visible through new_df as well.
term = 'One'
title = 'Title'
df = new_df
movie_search_term(df=df, title_col=title, term=term)

[(0, 'one piece'),
 (44, 'one punch man'),
 (72, 'one piece film: red'),
 (197, 'one piece: the movie'),
 (287, 'one piece: stampede'),
 (334, 'one piece film: gold'),
 (337, 'one piece film z'),
 (456, 'one piece: strong world'),
 (548, 'one piece: baron omatsuri and the secret island'),
 (593, 'tenpuru -no one can live on loneliness-'),
 (664, 'a thousand & one nights'),
 (693, 'one piece: heart of gold'),
 (695, 'one piece: clockwork island adventure'),
 (756, 'one piece: dead end adventure'),
 (848,
  "one piece - episode of east blue: luffy and his four friends' great adventure"),
 (911, 'level 1 demon lord and one room hero'),
 (1014,
  'one piece: episode of alabasta - the desert princess and the pirates'),
 (1016, 'one piece: episode of skypiea'),
 (1033,
  'one piece: episode of nami - tears of a navigator and the bonds of friends'),
 (1060, 'one piece: the cursed holy sword'),
 (1080, 'one piece: episode of luffy - adventure on hand island'),
 (1131, "one piece: 3d2y - overco

In [12]:
# Position of the movie to get recommendations for, chosen from the search results.
movie_idx = 695

In [13]:
# Preprocess text before vectorization
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def normalize(d):
    """Normalize one document for bag-of-words style models.

    Steps: remove every character that is not an ASCII letter, digit or
    whitespace; lower-case and strip; tokenize with NLTK; drop English stop
    words; re-join the remaining tokens with single spaces.

    Args:
        d: the document text (str).

    Returns:
        The normalized document as a single space-separated string.
    """
    stopwords = _english_stopwords()
    # The flags must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so passing re.I|re.A positionally capped the number
    # of replacements instead of setting flags.
    d = re.sub(r'[^a-zA-Z0-9\s]', '', d, flags=re.I|re.A)
    d = d.lower().strip()
    sentence_tokens = nltk.word_tokenize(d)
    return ' '.join(t for t in sentence_tokens if t not in stopwords)

In [14]:
# Smoke-test the normalizer on the summary stored under index label 0.
first_summary = new_df['Summary'][0]
test_norm = normalize(first_summary)
print(test_norm)

one piece follows adventures monkey luffy pirate crew order find greatest treasure ever left legendary pirate gold roger famous mystery treasure named one piece


In [24]:
def normalized_corpus(df: pd.DataFrame, descript_col: str, tokens=False, max_length: int = 0):
    """Normalize every document in ``df[descript_col]``.

    Args:
        df: frame holding the documents.
        descript_col: name of the text column.
        tokens: when False (default) return the normalized strings; when True
            tokenize each normalized document and pad the token lists to a
            common width.
        max_length: padded width used when ``tokens`` is True. The default 0
            means "use the length of the longest document".

    Returns:
        ``tokens=False``: 1-D array of normalized strings.
        ``tokens=True``: 2-D object array, one row of tokens per document,
        padded at the end of each row.
    """
    documents = list(df[descript_col])
    if not documents:
        # np.vectorize refuses size-0 input and max() of an empty sequence
        # raises, so return an empty result of the matching shape up front.
        if tokens:
            return np.empty((0, max_length), dtype=object)
        return np.array([], dtype=str)

    norm_corpus = np.vectorize(normalize)(documents)
    if not tokens:
        return norm_corpus

    tokenized_corpus = [nltk.word_tokenize(d) for d in norm_corpus]
    if max_length == 0:
        max_length = max(len(tok_corp) for tok_corp in tokenized_corpus)

    # NOTE(review): pad_sequences is called without `value`, so padding cells
    # hold its default fill value rather than a string token, and every row
    # ends up the same length. Confirm that downstream consumers (e.g. BM25,
    # which uses document length) are meant to see the padding.
    padded_tok_corps = pad_sequences(tokenized_corpus, maxlen=max_length, padding='post', dtype=object)
    return np.array(padded_tok_corps)

In [16]:
def tf_idf_features(df: pd.DataFrame, normalized_corpus):
    """Fit a TF-IDF vectorizer on the corpus and return the resulting matrix.

    The vectorizer covers unigrams and bigrams (``ngram_range=(1, 2)``) and is
    configured with ``min_df=2``.

    Args:
        df: not used by the function; kept in the signature for its callers.
        normalized_corpus: iterable of normalized document strings.

    Returns:
        Whatever ``TfidfVectorizer.fit_transform`` returns for the corpus.
    """
    vectorizer_options = {'ngram_range': (1, 2), 'min_df': 2}
    return TfidfVectorizer(**vectorizer_options).fit_transform(normalized_corpus)

In [17]:
def vector_cosine_sim(tfidf_array):
    """Return the pairwise cosine similarities of the rows of ``tfidf_array``.

    The output of ``cosine_similarity`` is wrapped in a DataFrame with default
    integer row and column labels.
    """
    similarity_matrix = cosine_similarity(tfidf_array)
    return pd.DataFrame(similarity_matrix)

In [51]:
def bm25_weights(tokenized_corpus):
    """Build a document-by-term matrix of BM25 weights.

    For every term ``t`` occurring in document ``d`` the weight is
    ``idf(t) * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len(d) / avgdl))``,
    with ``idf``, ``k1`` and ``b`` taken from a ``BM25Okapi`` model fitted on
    the corpus and ``avgdl`` the mean document length. Terms absent from a
    document get weight 0.

    Args:
        tokenized_corpus: iterable of documents, each an iterable of hashable
            tokens.

    Returns:
        DataFrame with one row per document (input order) and one column per
        distinct term (order of first appearance). An empty corpus yields an
        empty DataFrame.
    """
    docs = [list(doc) for doc in tokenized_corpus]
    n_docs = len(docs)
    if n_docs == 0:
        return pd.DataFrame()

    bm25 = BM25Okapi(docs)
    k1, b = bm25.k1, bm25.b
    # Fallback IDF for any term missing from the model's IDF table.
    average_idf = sum(bm25.idf.values()) / len(bm25.idf) if bm25.idf else 0.0

    doc_lengths = np.array([len(doc) for doc in docs], dtype=float)
    avgdl = doc_lengths.mean()

    weights = {}
    for doc_idx, doc in enumerate(docs):
        # Term frequencies of this document only; this avoids scanning the
        # whole corpus once per vocabulary term.
        term_counts = {}
        for token in doc:
            term_counts[token] = term_counts.get(token, 0) + 1
        if not term_counts:
            continue

        length_penalty = k1 * (1 - b + b * (doc_lengths[doc_idx] / avgdl))
        for term, tf in term_counts.items():
            column = weights.get(term)
            if column is None:
                column = weights[term] = np.zeros(n_docs)
            norm_tf = tf * (k1 + 1) / (tf + length_penalty)
            # Write the weight at this document's own row so that rows of the
            # result stay aligned with the input documents.
            column[doc_idx] = bm25.idf.get(term, average_idf) * norm_tf

    return pd.DataFrame(weights, index=pd.RangeIndex(n_docs))

In [19]:
def bert_weights(corpus):
    """Embed the corpus with a sentence-transformer and compare all documents.

    Loads the 'bert-base-nli-mean-tokens' model, encodes ``corpus`` and returns
    the cosine similarities between the resulting vectors as a DataFrame.
    """
    encoder = SentenceTransformer('bert-base-nli-mean-tokens')
    embeddings = encoder.encode(corpus)
    return pd.DataFrame(cosine_similarity(embeddings))

In [49]:
def recommend_movie(df:pd.DataFrame, title_col:str, index: int, vector, n: int):
    """Return the titles of the ``n`` movies most similar to the one at ``index``.

    Args:
        df: frame holding the titles; row positions must correspond to the
            rows/columns of ``vector``.
        title_col: name of the title column in ``df``.
        index: row position of the query movie.
        vector: square similarity matrix as a DataFrame (row ``i`` holds the
            similarities of movie ``i`` to every movie).
        n: number of recommendations to return.

    Returns:
        Array of up to ``n`` titles ordered by descending similarity, never
        containing the query row itself.
    """
    similarities = np.asarray(vector.iloc[index].values)
    query_pos = index + len(vector) if index < 0 else index
    ranked = np.argsort(-similarities)
    # Remove the query by position instead of assuming it sorts first: when
    # another movie ties with it (e.g. similarity 1.0), the query is not
    # guaranteed to be the first element of the ranking.
    ranked = ranked[ranked != query_pos][:max(n, 0)]
    return df[title_col].values[ranked]

In [47]:
# TF-IDF pipeline: normalize the summaries, vectorize them, then compare every
# pair of documents with cosine similarity.
movie_summary = 'Summary'
norm_corp = normalized_corpus(df=new_df, descript_col=movie_summary)
tfidf_matrix = tf_idf_features(df=new_df, normalized_corpus=norm_corp)
cosine_sim_matrix = vector_cosine_sim(tfidf_array=tfidf_matrix)
cosine_sim_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26355,26356,26357,26358,26359,26360,26361,26362,26363,26364
0,1.0,0.018423,0.012557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.017053,0.010103,0.0,0.0,0.0,0.013713,0.0,0.045604,0.0
1,0.018423,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.027681,0.0,0.0,0.0,0.0,0.0,0.00453
2,0.012557,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.014331,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.014241,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.060903,0.0,0.009309,0.0,...,0.034803,0.035403,0.039466,0.047332,0.054357,0.032324,0.018693,0.062598,0.0,0.010428


In [50]:
# Top TF-IDF recommendations for the selected movie.
num_movies = 5
reccs = recommend_movie(
    df=new_df,
    title_col=title,
    index=movie_idx,
    vector=cosine_sim_matrix,
    n=num_movies,
)
print(reccs)

# Known issue: many rows share the same title, so the recommendations can all
# carry one title (further preprocessing needed).

['one piece' 'one piece' 'one piece' 'one piece' 'one piece']


In [26]:
# Inspect every row whose (lower-cased) title is exactly 'one piece'.
one_piece_mask = new_df[title] == 'one piece'
new_df[one_piece_mask].values

array([['one piece',
        'One Piece. Follows the adventures of Monkey D. Luffy and his pirate crew in order to find the greatest treasure ever left by the legendary Pirate, Gold Roger. The famous mystery treasure named "One Piece".'],
       ['one piece',
        'One Piece. Luffy reflects on his past and how he came in possession of his Straw Hat.'],
       ['one piece',
        'One Piece. Luffy convinces Zoro to join her crew, but they must defeat Captain Morgan first. Meanwhile, Koby is accepted as a marine.'],
       ...,
       ['one piece',
        'One Piece. Luffy and Katakuri both land simultaneous blows ending their battle with both of them on the ground. Pekoms helps Luffy by bringing Brulee to allow him to escape from the mirror world.'],
       ['one piece',
        "One Piece. Sanji and Luffy make it back to the sunny. The fire tank pirates make it to their destination but their ship gets destroyed by Big Mom. As the straw hats are surrounded Jinbie's old crew the Su

In [None]:
# The BM25 method needs each document as a sequence of tokens rather than one string.
bm_25_norm_corp = normalized_corpus(df=new_df, descript_col=movie_summary, tokens=True)
bm_25_norm_corp[:3]  # not rendered: only the last expression of a cell is displayed
bm25_wts = bm25_weights(tokenized_corpus=bm_25_norm_corp)
bm25_wts.head()

In [None]:
# bm25_weights returns per-document term weights (documents x terms), while
# recommend_movie expects document-to-document similarities, so convert the
# weights to a cosine-similarity matrix first. This also replaces the
# previously undefined name `bm25_wts_df`.
bm25_sim_matrix = vector_cosine_sim(bm25_wts)
bm25_reccs = recommend_movie(df=new_df, title_col=title, index=movie_idx, vector=bm25_sim_matrix, n=num_movies)
print(bm25_reccs)

In [69]:
# BERT method. Encoding is CPU-heavy, so only the first 10k rows of new_df are used.
movie_summary = 'Summary'
bert_subset = new_df.iloc[:10000]
norm_corp = normalized_corpus(df=bert_subset, descript_col=movie_summary)
bert_wts = bert_weights(corpus=norm_corp)

In [72]:
# Recommend from the BERT similarity matrix and show the two closest matches.
bert_reccs = recommend_movie(
    df=new_df,
    title_col=title,
    index=movie_idx,
    vector=bert_wts,
    n=num_movies,
)
top_two = bert_reccs[:2]
print(top_two)

['one piece: baron omatsuri and the secret island'
 'lupin iii: princess of the breeze']
