In [1]:
import numpy as np
import pandas as pd
import gensim
import math

In [2]:
test_queries = ["music movie", "bakugan battle brawlers", "tenchi shounen", "azuki daizu show", "comedy", "boy superpowers", "harem cats", "ninja fighting", "fairy tail", "2004 shounen show"]

data = pd.read_csv('test_dataset_labeled.csv')
data['Input Data'] = data['Name'].apply(str) + " " + data['Genres'].apply(str) + " " + data['Type'].apply(str) + " " + data['Aired'].apply(str) + " " + data['Studios'].apply(str) + " " + data['sypnopsis'].apply(str)

In [3]:
# LSA Model
import spacy
import re
import string
from spacy.lang.en.stop_words import STOP_WORDS

spacy_nlp = spacy.load('en_core_web_sm')

punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS

def spacy_tokenizer(sentence):
    sentence = re.sub('\'','',sentence)
    sentence = re.sub('\w*\d\w*','',sentence)
    sentence = re.sub(' +',' ',sentence)
    sentence = re.sub(r'\n: \'\'.*','',sentence)
    sentence = re.sub(r'\n!.*','',sentence)
    sentence = re.sub(r'^:\'\'.*','',sentence)
    sentence = re.sub(r'\n',' ',sentence)
    sentence = re.sub(r'[^\w\s]',' ',sentence)
    tokens = spacy_nlp(sentence)
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
    tokens = [word for word in tokens if word not in stop_words and word not in punctuations and len(word) > 2]
    
    return tokens

data['input_data_tokenized'] = data['Input Data'].map(lambda x: spacy_tokenizer(x))
input_data = data['input_data_tokenized']
dictionary = gensim.corpora.Dictionary(input_data)
stoplist = set('hello and if this can would should could tell ask stop come go')
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)
corpus = [dictionary.doc2bow(desc) for desc in input_data]
st_tfidf_model = gensim.models.TfidfModel(corpus, id2word=dictionary)
st_lsi_model = gensim.models.LsiModel(st_tfidf_model[corpus], id2word=dictionary, num_topics=300)

from operator import itemgetter

st_tfidf_corpus = gensim.corpora.MmCorpus('test_set_tfidf_model_mm')
st_lsi_corpus = gensim.corpora.MmCorpus('test_set_lsi_model_mm')
st_index = gensim.similarities.MatrixSimilarity(st_lsi_corpus, num_features = st_lsi_corpus.num_terms)

def search_anime(search_term, num_best=5):

    query_bow = dictionary.doc2bow(spacy_tokenizer(search_term))
    query_tfidf = st_tfidf_model[query_bow]
    query_lsi = st_lsi_model[query_tfidf]

    st_index.num_best = num_best

    st_list = st_index[query_lsi]

    st_list.sort(key=itemgetter(1), reverse=True)
    st_names = []

    for j, dentry in enumerate(st_list):

        st_names.append (
            {
                'Relevance': round((dentry[1] * 100),2),
                'Name': data['Name'][dentry[0]],
                'Genres' : data['Genres'][dentry[0]],
                'Synopsis' : data['sypnopsis'][dentry[0]],
                'MAL_ID' : data['MAL_ID'][dentry[0]]
            }

        )
        if j == (st_index.num_best-1):
            break

    return pd.DataFrame(st_names, columns=['Relevance','Name','Genres','Synopsis','MAL_ID'])

In [14]:
# VSM Model
import glob
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from collections import OrderedDict

def wordList_removePuncs(doc_dict):
  stop = stopwords.words('english') + list(string.punctuation) + ['\n']
  wordList = []
  for doc in doc_dict.values():
    for word in word_tokenize(doc.lower().strip()): 
      if not word in stop:
        wordList.append(word)
  return wordList

def termFrequencyInDoc(vocab, doc_dict):
  tf_docs = {}
  for doc_id in doc_dict.keys():
    tf_docs[doc_id] = {}
  
  for word in vocab:
    for doc_id,doc in doc_dict.items():
      tf_docs[doc_id][word] = doc.count(word)
  return tf_docs

def wordDocFre(vocab, doc_dict):
  df = {}
  for word in vocab:
    frq = 0
    for doc in doc_dict.values():
      # if word in doc.lower().split():
      if word.lower() in word_tokenize(doc.lower().strip()):
        frq = frq + 1
    df[word] = frq
  return df

def inverseDocFre(vocab,doc_fre,length):
  idf= {} 
  for word in vocab:     
    idf[word] = np.log2((length+1) / doc_fre[word])
  return idf

def tfidf(vocab,tf,idf_scr,doc_dict):
  tf_idf_scr = {}
  for doc_id in doc_dict.keys():
    tf_idf_scr[doc_id] = {}
  for word in vocab:
    for doc_id,doc in doc_dict.items():
      tf_idf_scr[doc_id][word] = tf[doc_id][word] * idf_scr[word]
  return tf_idf_scr

def vectorSpaceModel(query, doc_dict,tfidf_scr, num_best=5):
  query_vocab = []
  for word in query.split():
    if word not in query_vocab:
      query_vocab.append(word)

  query_wc = {}
  for word in query_vocab:
    query_wc[word] = query.lower().split().count(word)
  
  relevance_scores = {}
  for doc_id in doc_dict.keys():
    score = 0
    for word in query_vocab:
      score += query_wc[word] * tfidf_scr[doc_id].get(word, 0)
    relevance_scores[doc_id] = score
  sorted_value = OrderedDict(sorted(relevance_scores.items(), key=lambda x: x[1], reverse = True))
  top_results = {k: sorted_value[k] for k in list(sorted_value)[:num_best]}
  return top_results


docs = {}
for i in data.index:
  docs[i] = str(data.loc[i, 'Input Data'])

M = len(docs)                                 #number of files in dataset
w_List = wordList_removePuncs(docs)           #returns a list of tokenized words
vocab = list(set(w_List))                     #returns a list of unique words
tf_dict = termFrequencyInDoc(vocab, docs)     #returns term frequency
df_dict = wordDocFre(vocab, docs)             #returns document frequencies
idf_dict = inverseDocFre(vocab,df_dict,M)     #returns idf scores
tf_idf = tfidf(vocab,tf_dict,idf_dict,docs)   #returns tf-idf socres

In [5]:
# MAP functions

def precision_at_k(r, k):
  assert k >= 1
  r = np.asarray(r)[:k] != 0
  return np.mean(r)

def avg_precision(r):
  sum_p = 0
  total_relevant = 0
  
  for k in range(len(r)):
    if r[k] != 0:
      total_relevant += 1
      sum_p += precision_at_k(r,k+1)
  
  try:
    avg_p = sum_p/total_relevant
  except:
    avg_p = 0
  return avg_p

def mean_avg_precision(rs):
  sum_ave_p = 0
  for r in rs:
    sum_ave_p += avg_precision(r)
  m_avg_p = sum_ave_p/len(rs)
  return m_avg_p  

In [25]:
# mean nDCG functions

def dcg_at_k(r, k):
  dcg = r[0] # first term
  for i in range(k-1):
    dcg += r[i+1] / math.log2(i+2)
    
  return dcg

def ndcg_at_k(r, k):
  dcg = dcg_at_k(r,k)
  
  r = sorted(r, reverse=True)
  idcg = dcg_at_k(r,k)
  return dcg/idcg if idcg!=0 else 0

def mean_ndcg(rs):
  sum_ndcg = 0
  for r in rs:
    k = len(r)
    sum_ndcg += ndcg_at_k(r,k)
  m_ndcg = sum_ndcg/len(rs)
  return m_ndcg

In [7]:
# MRR functions

def reciprocal_rank(r):
  for i in range(len(r)):
    if r[i] != 0 :
      return 1/(i+1)
  return 0

def mean_reciprocal_rank(rs):
  sum_rr = 0
  for r in rs:
    sum_rr += reciprocal_rank(r)
  m_rr = sum_rr/len(rs)
  return m_rr

In [26]:
rs_vsm = []
for query in test_queries:
  result = vectorSpaceModel(query,docs,tf_idf)
  r = []
  
  for index in result:
    r.append(data.loc[index, [query]].values[0])
  rs_vsm.append(r)

print("___VSM Model___")
print(f"MAP: {mean_avg_precision(rs_vsm)}")
print(f"mean nDCG: {mean_ndcg(rs_vsm)}")
print(f"MRR: {mean_reciprocal_rank(rs_vsm)}")

___VSM Model___
MAP: 0.5533333333333333
mean nDCG: 0.5592314863560516
MRR: 0.6


In [33]:
# evaluating LSA
rs_lsa = []
for query in test_queries:
  result = search_anime(query, num_best=5)
  result = result["MAL_ID"].to_numpy()
  r = []
  for mal_id in result:
    r.append(data[data["MAL_ID"]==mal_id][query].values[0])
  rs_lsa.append(r)

print("___LSA Model___")
print(f"MAP: {mean_avg_precision(rs_lsa)}")
print(f"mean nDCG: {mean_ndcg(rs_lsa)}")
print(f"MRR: {mean_reciprocal_rank(rs_lsa)}")

___LSA Model___
MAP: 0.35444444444444445
mean nDCG: 0.5186668036159905
MRR: 0.3166666666666667
