# Homework 3 - What is the best anime in the world?

In [10]:
import requests
from bs4 import BeautifulSoup as bs
import os
import pickle
import numpy as np
import time
import re
import datetime as dt 
import csv
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

## 1. Data collection

### 1.1. Get the list of animes

In [2]:
def take_n_urls(n):

    main_url = "https://myanimelist.net/topanime.php"

    # this list will contain all the urls we'll retrieve
    urls = [] 

    # each page shows 50 elements and we can retrieve each page by manipulating the "limit" query
    for limit in range(0, n, 50): 
        content = requests.get(main_url,
                               params={"limit": limit})
        if content.status_code == 404:
            print(f"Page with limit {limit} was not found. Interrumpting and returning pages found")
            break
        soup = bs(content.content, "html.parser")

        # from the content of each page we retrieve the portions that contain the urls
        results = soup.find_all("a", 
                                class_= "hoverinfo_trigger fl-l ml12 mr8")

        # finally, we take the string containing each url by taking the attribute href,
        # and we append them in the urls list
        for result in results:
            url = result["href"]
            if url not in urls:  # check for duplicates
                urls.append(url)

    return urls

In [27]:
if "urls.txt" not in os.listdir():
    urls = take_n_urls(20000)
    # Since the output of this step has to be a txt file, here we write one with each
    # url separated by a newline
    with open("urls.txt", "w") as file:
        file.write("\n".join(urls_str))
else:
    with open("urls.txt", "r", encoding="utf8") as file:
        print("Loading urls...")
        urls = file.read().split("\n")
        print("Done!")

Loading urls...
Done!


In [4]:
# we end up with 19131 urls. 
# I added a check that tells us when we have exceeded the length of the ranking list and returns what has been found
# up until that moment (so to avoid losing any more time with get requests that point to nothing)
# I know in the assignment they said 20000 but I'm fairly sure that's all the entries. 
# This is easy to see if we manually set the limit in the url and check the results. 
# For example: https://myanimelist.net/topanime.php?limit=15000 contains rankings 15001-15050. The first entry is
# Big X Episode 0. If we check our list with urls[15000] (remember that our list is 0-indexed) we obtain the same result.
# This to me seems to point to a correct behavior from the function, but let me know what you think.

print(len(urls)) 
print(urls[15000])
urls_str = list(map(str, urls))

19130
https://myanimelist.net/anime/30839/Big_X_Episode_0


### 1.2. Crawl animes

Attention: the index of article start with 0 and not 1 so all ranks are shifted by 1 position

In [5]:
# Here we create the directory where the html pages will be stored
if "html_pages" not in os.listdir():
    os.mkdir("html_pages")

In [6]:
def save_html_pages(urls):
    if "counter_pages" not in os.listdir():
        start = 0
    else:
        with open("counter_pages", "rb") as counter_file:
            start = pickle.load(counter_file) + 1

    print(f"Starting from anime #{start}")
    n = len(urls)
    for i in range(start, n):
        ranking_page = str(int(np.floor(i/50)))
        if i % 50 == 0 or f"ranking_page_{ranking_page}" not in os.listdir("./html_pages"):
            os.mkdir(f"html_pages/ranking_page_{ranking_page}")
        html_page = requests.get(urls[i])
        sleep_timer = 60
        while html_page.status_code != 200: # if the status_code is not 200, we've exceeded the number of requests and have to wait
            print(f"Exceeded number of requests while retrieving page #{i}.\nWaiting {sleep_timer} seconds")
            html_page.close()
            time.sleep(sleep_timer)
            html_page = requests.get(urls[i])
            sleep_timer += 10
        with open (f"html_pages/ranking_page_{ranking_page}/article_{i}.html", "w", encoding="utf-8") as file:
            file.write(html_page.text)
        with open ("counter_pages", "wb") as counter_file:
            pickle.dump(i, counter_file)


In [7]:
save_html_pages(urls)

Starting from anime #19130


### 1.3 Parse downloaded pages

At this point, you should have all the html documents about the animes of interest and you can start to extract the animes informations. The list of information we desire for each anime and their format is the following:

1. Anime Name (to save as `animeTitle`): String
2. Anime Type (to save as `animeType`): String
3. Number of episode (to save as `animeNumEpisode`): Integer
4. Release and End Dates of anime (to save as `releaseDate` and `endDate`): Convert both release and end date into datetime format.
5. Number of members (to save as `animeNumMembers`): Integer
6. Score (to save as `animeScore`): Float
7. Users (to save as `animeUsers`): Integer
8. Rank (to save as `animeRank`): Integer
9. Popularity (to save as `animePopularity`): Integer
10. Synopsis (to save as `animeDescription`): String
11. Related Anime (to save as `animeRelated`): Extract all the related animes, but only keep unique values and those that have a hyperlink associated to them. List of strings.
12. Characters (to save as `animeCharacters`): List of strings.
13. Voices (to save as `animeVoices`): List of strings
14. Staff (to save as `animeStaff`): Include the staff name and their responsibility/task in a list of lists.


In [50]:
path_ex_aurelie ='C:/Users/aurel/OneDrive/Bureau/IMT/3ème année IMT/0_Cours Sapienza/ADM/Homework/Homework 3'
path_ex_alessandro = "."


In [9]:
# Here we create the directory where the tsv files will be stored
if "tsv_files" not in os.listdir():
    os.mkdir("tsv_files")

In [10]:

def collect_info(num_article, folder='tsv_files'):
    ranking_page = str(int(np.floor(num_article/50)))
    article=f'{path_ex_aurelie}/html_pages/ranking_page_{ranking_page}/article_{num_article}.html'
    with open(article, "r", encoding="utf-8") as file:
        art= bs(file.read(), 'html.parser')
    
    #animeTitle
    animeTitle = art.find('h1', {'class':"title-name h1_bold_none"}).string
    #print('animeTitle :',animeTitle)
    
    
    #animeType
    animeType = art.find('span', {'class':"information type"}).string
    #print('animeType :',animeType)
    
    
    #animeNumEpisode and Dates (there is not specific name for those two info)
    #list lines with tag <div class="spaceit_pad">
    lines = art.find_all('div', {'class':"spaceit_pad"})
    for line in lines :
        #for each div tag there is one span, so here we look for the span tag with 'Episodes:' and 'Aired'
        sp= line.find('span', {'class':"dark_text"})
        # to avoid error if there is no span
        if sp is not None :
            #for span 'Episodes' (and the div tag which corresponds)
            if sp.string == 'Episodes:' :
                #extract the content of the right div tag and take the third line which correspond to the number of episodes
                if line.contents[2] != '\n  Unknown\n  ' :
                    animeNumEpisode = int(line.contents[2])
                    #animeNumEpisode = int(re.findall(r'-?\d+\.?\d*', str(line))[0])           #if we want to use regex  
                else :
                    animeNumEpisode = ''
            #for span 'Aired' (and the div tag which corresponds)
            if sp.string == 'Aired:' :
                str_dates = line.contents[2].split('\n  ')[1]
                if str_dates == 'Not available':
                    releaseDate = ''
                    endDate = ''
                else :
                    #if "Status: Finished Airing" (there is a endDate)
                    if ('to' in str_dates) and ('?' not in str_dates):
                        #extract the content of the right div tag and take the third line which correspond to the dates (fix the issue of '\n')
                        str_releaseDate, str_endDate = str_dates.split(' to ')

                        #choose the right datetime format of str_releaseDate 
                        if len(str_releaseDate.split(' '))==3:
                            date_format_releaseDate = "%b %d, %Y"
                        elif len(str_releaseDate.split(' '))==2:
                            date_format_releaseDate = "%b %Y"
                        else :
                            date_format_releaseDate = "%Y"
                        #convert str_releaseDate into a datetime
                        releaseDate = dt.datetime.strptime(str_releaseDate, date_format_releaseDate)

                        #choose the right datetime format of str_endDate 
                        if len(str_endDate.split(' '))==3:
                            date_format_endDate = "%b %d, %Y"
                        elif len(str_endDate.split(' '))==2:
                            date_format_endDate = "%b %Y"
                        else :
                            date_format_endDate = "%Y"
                        #convert str_releaseDate into a datetime
                        endDate = dt.datetime.strptime(str_endDate, date_format_endDate)

                    else :
                        str_releaseDate = str_dates.split(' to ')[0]
                        #choose the right datetime format of str_releaseDate 
                        if len(str_releaseDate.split(' '))==3:
                            date_format_releaseDate = "%b %d, %Y"
                        elif len(str_releaseDate.split(' '))==2:
                            date_format_releaseDate = "%b %Y"
                        else :
                            date_format_releaseDate = "%Y"
                        #convert str_releaseDate into a datetime
                        releaseDate = dt.datetime.strptime(str_releaseDate, date_format_releaseDate)

                        endDate=''
    #print('animeNumEpisode :',animeNumEpisode)
    #print('releaseDate :',releaseDate)
    #print('endDate :',endDate)
    
    
    #animeNumMembers
    animeNumMembers = int(art.find('span', {'class':"numbers members"}).contents[1].string.replace(',',''))
    #print('animeNumMembers :',animeNumMembers)
    
    
    #animeScore
    score = art.find('div', {'class':"score-label"}).string
    if score == 'N/A':
        animeScore = ''
    else :
        animeScore = float(score)
    #print('animeScore :',animeScore)
    
    
    #animeUsers
    if art.find('span', {'itemprop':"ratingCount"}) is not None :
        animeUsers = int(art.find('span', {'itemprop':"ratingCount"}).string)
    else :
        animeUsers = ''
    #print('animeUsers :',animeUsers)
    
    
    #animeRank
    if art.find('span', {'class':"numbers ranked"}).contents[1].string != 'N/A':
        animeRank = int(art.find('span', {'class':"numbers ranked"}).contents[1].string.replace('#',''))
    else :
        animeRank =''
    #print('animeRank :',animeRank)
    
    
    #animePopularity
    animePopularity = int(art.find('span', {'class':"numbers popularity"}).contents[1].string.replace('#',''))
    #print('animePopularity :',animePopularity)
    
    
    #animeDescription
    desc = art.find('p', {'itemprop':"description"}).contents
    animeDescription=''
    #remove <br/> Tag and '\n'
    for ele in desc :
        #delete tags with regex 
        ele = re.sub(re.compile('<.*?>'),'', str(ele))
        animeDescription += ele
        animeDescription = animeDescription.replace('\n','')
    #print('animeDescription :',animeDescription.replace('\n',''))
    
    
    #animeRelated
    animeRelated = []
    #store the table which contain related animes
    table = art.find('table', {'class':"anime_detail_related_anime"})
    if table is not None :
        #store all links/anime related with 'a' Tag
        links = table.find_all('a')
        for link in links :
            # check if there is a hyperlink and add it in the list if yes 
            if (link.get('href') is not None) and (link.string is not None):
                animeRelated += [link.string]
        animeRelated=list(set(animeRelated))
    else :
        animeRelated=''
    #print('animeRelated :',animeRelated)

    
    #animeCharacters
    animeCharacters = art.find_all('h3', {'class':"h3_characters_voice_actors"})
    animeCharacters = [char.string for char in animeCharacters]
    #print('animeCharacters :',animeCharacters)
    
    
    #animeVoices
    td_Voices = art.find_all('td', {'class':"va-t ar pl4 pr4"})
    animeVoices = [voice.find('a').string for voice in td_Voices]
    #print('animeVoices :',animeVoices)
    
    
    #animeStaff
    #if there is a staff, the div which correspond to the table Staff is the second one (there are div with {'class':"detail-characters-list clearfix"})
    if len(art.find_all('div', {'class':"detail-characters-list clearfix"}))>1 :
        div_staff = art.find_all('div', {'class':"detail-characters-list clearfix"})[1] 
        td_staff = div_staff.find_all('td', {'class':"borderClass"})
        animeStaff=[]
        for td in td_staff :
            if td.get('width') == None:
                animeStaff.append([td.find('a').string,td.find('small').string])
    #if there is not staff
    else :
        animeStaff = ''
    #print('animeStaff :',animeStaff)
    
    #create a .tsv file with attributes
    with open(f'{folder}/anime_{num_article}', 'wt', encoding="utf8") as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow([animeTitle, animeType, animeNumEpisode, releaseDate, endDate, animeNumMembers, \
                            animeScore, animeUsers, animeRank, animePopularity, animeDescription, animeRelated, \
                            animeCharacters, animeVoices, animeStaff])
    

In [9]:
for i in range(17346, len(urls)):
    collect_info(i)


## 2. Search Engine

### 2.0. Pre-processing

1. Removing stopwords
2. Removing punctuation
3. Stemming
4. Anything else you think it's needed

In [5]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aurel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aurel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
# For stemming
porter = nltk.stem.PorterStemmer()
lancaster = nltk.stem.LancasterStemmer()

# For identifying the stop words
eng_stopwords = stopwords.words("english")

In [23]:
def process_text(text):
    try:
        tokenized = nltk.word_tokenize(text)
        stemmed = [porter.stem(word) for word in tokenized if ((word.lower() not in eng_stopwords) and (word not in string.punctuation))]
    except TypeError as e:
        print(text)
        raise TypeError
    return stemmed

In [24]:
import re

# To sort the files correctly
def alphanumeric_sort(key):
    num = int(re.search("([0-9]+)", key).group(0))
    return num

def merge_tsvs(path, colnames):
    files = sorted(os.listdir(path), key=alphanumeric_sort)
    df = pd.read_csv(path+files[0],
                     names=colnames,
                     sep="\t", engine='python')
    for file_name in files[1:]:
        df2 = pd.read_csv(path+file_name,
                          names=colnames,
                          sep="\t", engine='python')
        df = pd.concat([df, df2], ignore_index=True)
    return df

In [160]:
# testing the implemented sorting algorithm
#print(sorted(os.listdir("./tsv_files/"), key=alphanumeric_sort))

In [161]:
path = "./tsv_files/"
colnames = ["animeTitle", "animeType", "animeNumEpisode", "releaseDate", "endDate", "animeNumMembers",
            "animeScore", "animeUsers", "animeRank", "animePopularity", "animeDescription", "animeRelated",
            "animeCharacters", "animeVoices", "animeStaff"]
df = merge_tsvs(path, colnames)

# Save our df in csv format
df.to_csv("./html_df.csv")

In [162]:
# create a copy of our dataframe with an extra column containing the preprocessed synopsis
df_new = df.assign(tokenized_desc=df["animeDescription"].apply(lambda x: process_text(x)))

In [163]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19130 entries, 0 to 19129
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   animeTitle        19130 non-null  object 
 1   animeType         19130 non-null  object 
 2   animeNumEpisode   18620 non-null  float64
 3   releaseDate       18760 non-null  object 
 4   endDate           8451 non-null   object 
 5   animeNumMembers   19130 non-null  int64  
 6   animeScore        13436 non-null  float64
 7   animeUsers        13436 non-null  float64
 8   animeRank         17307 non-null  float64
 9   animePopularity   19130 non-null  int64  
 10  animeDescription  19130 non-null  object 
 11  animeRelated      12706 non-null  object 
 12  animeCharacters   19130 non-null  object 
 13  animeVoices       19130 non-null  object 
 14  animeStaff        10247 non-null  object 
 15  tokenized_desc    19130 non-null  object 
dtypes: float64(4), int64(2), object(10)
memo

In [164]:
with open("df_with_tokens.p", "wb") as file:
    pickle.dump(df_new, file)

### 2.1. Conjunctive query
#### 2.1.1) Create your index!

Before building the index, 
* Create a file named `vocabulary`, in the format you prefer, that maps each word to an integer (`term_id`).

Then, the first brick of your homework is to create the Inverted Index. It will be a dictionary of this format:

```
{
term_id_1:[document_1, document_2, document_4],
term_id_2:[document_1, document_3, document_5, document_6],
...}
```
where _document\_i_ is the *id* of a document that contains the word.


In [3]:
# First i chose to create the vocabulary file as a DataFrame store as a .csv file 
def create_vocabulary(corpus, name_voc_file = "vocabulary.pkl"):
    voc = set()
    i=0
    for doc in corpus :
        #print(i)
        #i+=1
        voc = voc.union(set(doc))

    dict_voc = dict(zip(sorted(voc),range(len(voc))))
    with open(name_voc_file, "wb") as file:
        pickle.dump(dict_voc, file)
    return dict_voc

In [21]:
def inverted_index(corpus,voc, name_inv_ind_file = "inverted_index.pkl"):
    #create a inverted_index "empty", i.e. only with term_id of vocabulary
    inverted_index = dict()
    for term, term_id in voc.items() :
        inverted_index[term_id]=set()
        #inverted_index['term_id_'+str(term_id)]=set()

    for doc, num_doc in zip(corpus,range(len(corpus))) :
        #print(num_doc)
        words_checked = []
        for word in doc :
            if word not in words_checked: # to avoid looking for the same word more than once in the same doc
                words_checked.append(word)
                term_id = voc[word]
                inverted_index[term_id]=inverted_index[term_id].union(set([num_doc]))
                #inverted_index['term_id_'+str(term_id)]=inverted_index['term_id_'+str(term_id)].union(set(['document_'+str(num_doc)]))
    
    for term_id, docs in inverted_index.items() :
        inverted_index[term_id]=sorted(list(inverted_index[term_id]))
    
    #save the inverted_index as a .pkl file
    with open(name_inv_ind_file, "wb") as file:
        pickle.dump(inverted_index,file)
    
    return inverted_index

In [22]:
#test with a simple case

L=['A','C', 'B', 'A', 'E']
corpus_test=[L]+[['B','D']]
create_vocabulary(corpus_test)

with open("vocabulary.pkl", "rb") as file:
    voc_test = pickle.load(file)
        
inverted_index(corpus_test,voc_test)

with open("inverted_index.pkl", "rb") as file:
    inv_ind_test = pickle.load(file)
print(inv_ind_test)

{0: [0], 1: [0, 1], 2: [0], 3: [1], 4: [0]}


In [4]:
#download the dataset of synopsis stored in df_with_tokens.p
def download_corpus(name_file_corpus = 'df_with_tokens.p'):
    with open(name_file_corpus, 'rb') as file:
        df = pickle.load(file)

    corpus = list(df['tokenized_desc'])
    print(len(corpus))
    return corpus

#download the voc or create it if it does not already exist
def download_voc(corpus, name_voc_file):
    if name_voc_file not in os.listdir():
        voc = create_vocabulary(corpus, name_voc_file)
    else :
        with open(name_voc_file, "rb") as file:
            voc = pickle.load(file)
    return voc

#download the voc or create it if it does not already exist
def download_inverted_index(corpus,voc, name_inv_ind_file):
    if name_inv_ind_file not in os.listdir():
        inv_ind = inverted_index(corpus,voc, name_inv_ind_file)
    else :
        with open(name_inv_ind_file, "rb") as file:
            inv_ind = pickle.load(file)
    return inv_ind

In [11]:
#define the names of the interesting files 
name_voc_file = "vocabulary.pkl"
name_file_corpus = 'df_with_tokens.p'
name_inv_ind_file = "inverted_index.pkl"

#download the corpus and the vocabulary
corpus = download_corpus(name_file_corpus)
voc = download_voc(corpus, name_voc_file)
inv_ind = download_inverted_index(corpus,voc, name_inv_ind_file)

19130


#### 2.1.2) Execute the query
Given a query, that you let the user enter:

```saiyan race```

the Search Engine is supposed to return a list of documents.

##### What documents do we want?
Since we are dealing with conjunctive queries (AND), each of the returned documents should contain all the words in the query.
The final output of the query must return, if present, the following information for each of the selected documents:

* `animeTitle`
* `animeDescription`
* `Url`

__Example Output__:

| animeTitle | animeDescription | Url |
|:-----------------------------:|:-----:|:------------------------------------------------------------:|
| Fullmetal Alchemist: Brotherhood | ... | https://myanimelist.net/anime/5114/Fullmetal_Alchemist__Brotherhood |
| Gintama | ... | https://myanimelist.net/anime/28977/Gintama%C2%B0 |
| Shingeki no Kyojin Season 3 Part 2 | ... | https://myanimelist.net/anime/38524/Shingeki_no_Kyojin_Season_3_Part_2 |

If everything works well in this step, you can go to the next point, and make your Search Engine more complex and better in answering queries.


In [53]:
def search_engine_1(voc, inverted_index):
    #ask the query to the user
    query = input('What is your query ?').split(' ')
    
    first_word = query[0]
    
    if first_word in voc :
        first_term_id = voc[first_word]
        docs_list = set(inverted_index[first_term_id])
        for word in query[1:] :
            if word in voc :
                term_id = voc[word]
                docs = inverted_index[term_id]
                docs_list = docs_list.intersection(set(inverted_index[first_term_id]))
            else :
                print('Nothing correspond to your queries')
                return
        
        #Now we have the doc IDs so we can merge interesting information 
        html_df = pd.read_csv(path_ex_aurelie+"/html_df.csv") #csv which contains tsv line of each document
        cols = ["animeTitle", "animeDescription"]
        result = html_df.iloc[sorted(list(docs_list))][cols]
        result['Url'] = [urls[i] for i in sorted(list(docs_list))]
        
        return result
    
    else :
        print('Nothing correspond to your queries')
        return 

In [54]:
search_engine_1(voc, inv_ind)

What is your query ?saiyan race


Unnamed: 0,animeTitle,animeDescription,Url
364,Dragon Ball Z,Five years after winning the World Martial Art...,https://myanimelist.net/anime/813/Dragon_Ball_Z
401,Dragon Ball Super: Broly,"Forty-one years ago on Planet Vegeta, home of ...",https://myanimelist.net/anime/36946/Dragon_Bal...
1035,Dragon Ball Kai,"Five years after the events of Dragon Ball, ma...",https://myanimelist.net/anime/6033/Dragon_Ball...
1467,Dragon Ball Z Special 1: Tatta Hitori no Saish...,"Bardock, Son Goku's father, is a low-ranking S...",https://myanimelist.net/anime/986/Dragon_Ball_...
1961,Dragon Ball Super,"Seven years after the events of Dragon Ball Z,...",https://myanimelist.net/anime/30694/Dragon_Bal...
2015,Dragon Ball Z Movie 14: Kami to Kami,"Following the defeat of a great adversary, Gok...",https://myanimelist.net/anime/14837/Dragon_Bal...
2296,"Dragon Ball Z Movie 08: Moetsukiro!! Nessen, R...",As Goku investigates the destruction of the So...,https://myanimelist.net/anime/901/Dragon_Ball_...
4339,Dragon Ball: Ossu! Kaettekita Son Gokuu to Nak...,Based on an original concept by the original a...,https://myanimelist.net/anime/5152/Dragon_Ball...
4673,Dragon Ball Z Movie 10: Kiken na Futari! Super...,"After his loss to Goku, Broly crash lands and ...",https://myanimelist.net/anime/903/Dragon_Ball_...
5664,Dragon Ball Z: Summer Vacation Special,"One peaceful afternoon, the Son family and fri...",https://myanimelist.net/anime/22695/Dragon_Bal...


### 2.2) Conjunctive query & Ranking score
#### 2.2.1) Inverted index

In [12]:
def get_tfidf(word, doc, corpus, idf=None):
    tf = doc.count(word) / len(doc)
    counter_docs = 0
    # if the idf parameter has not been provided, we compute it
    if idf == None:
        for text in corpus:
            if word in text:
                counter_docs += 1
        idf = np.log(len(corpus) / counter_docs)
    tfidf = tf * idf
    return idf, tfidf

In [82]:
def second_inverted_index(corpus, voc):
    inverted_index_2 = dict()
    # first, we initialize each field in the inverted_index
    for term_id in voc.values() :
        inverted_index_2[term_id]=list()
        
    idf_calculated = {} # the idfs can be calculated once for each word since idf = np.log(len(corpus) / documents_with_word)
    
    for doc, num_doc in zip(corpus,range(len(corpus))) :
        words_checked = []
        for word in doc:
            if word not in words_checked: # to avoid looking for the same word more than once in the same doc
                term_id = voc[word]
                # if this is the first time we encounter this word, we need to obtain the idf and save it for future use
                if word not in idf_calculated.keys():
                    idf, tfidf = get_tfidf(word, doc, corpus)
                    idf_calculated[word] = idf
                # otherwise, we provide it to the function directly
                else:
                    _, tfidf = get_tfidf(word, doc, corpus, idf)
                # we add the doc index and the tfidf score to the dictionary
                inverted_index_2[term_id].append([num_doc, tfidf])
                # we mark this word as "checked" for this document
                words_checked.append(word)
    
    # we order the items by tfidf score for that term
    for term_id, docs in inverted_index_2.items() :
        inverted_index_2[term_id]=sorted(inverted_index_2[term_id], key=lambda x: x[1])
    
    # finally we save the item in order to avoid having to create the index again
    with open ("inverted_index_2.p", "wb") as file:
        pickle.dump(inverted_index_2, file)
    return inverted_index_2, idf_calculated # we also return the calculated idfs so to avoid calculating them over and over

In [83]:
if "inverted_index_2.p" not in os.listdir():
    ii_2, idfs = second_inverted_index(corpus, voc)
else:
    with open ("inverted_index_2.p", "rb") as file:
        ii_2 = pickle.load(file)

In [84]:
def cosine_similarity(vec1, vec2):
    num = np.dot(vec1, vec2)
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    cos = num / denom
    return cos

In [207]:
import heapq

# implement the case in which the query is not in any document

def search_k_matches(query, corpus, voc, ii, idfs, k=10):
    df = pd.read_csv("./html_df.csv")
    query = process_text(query.lower())
    dict_relevant = {}
    for word in query:
        if word in voc.keys(): # checks if query is in our vocabulary
            term_id = voc[word]
            for doc_info in ii[term_id]:
                if doc_info[0] not in dict_relevant.keys():
                    dict_relevant[doc_info[0]] = []
                dict_relevant[doc_info[0]].append(doc_info[1])
    len_query = len(query)
    query_vector = [(query.count(x) / len_query) * idfs[x] for x in query if x in idfs.keys()] # we treat the query as a document
    distances = []
    for key in dict_relevant.keys():
        vector = dict_relevant[key]
        if len(vector) == len(query_vector): # this assures the conjuctive (and) property of the search engine
            distances.append((-cosine_similarity(query_vector, vector), key)) # negative of cosine_similarity to get max heap
    heapq.heapify(distances)
    n_matches = len(distances)
    final_results = []
    for i in range(min(k, n_matches)):
        el = heapq.heappop(distances)
        final_results.append([el[1], -el[0]]) # make the cosine distance positive again for the output
    
    #print(final_results)
    indices = [x[0] for x in final_results]
    distances = [x[1] for x in final_results]
    cols = ["animeTitle", "animeDescription"]
    partial_df = df.iloc[indices][cols]
    final_df = partial_df.assign(Url=[urls[i] for i in indices],
                                 Similarity=distances)
    return final_df

Here we test it against the keywords "ranma urusei" which are two characters from two different animes that only meet in a special crossover episode called "It's a Rumic World". We can see that the search engine correctly gives this result as the best match, followed by another Ranma 1/2 episode.

In [208]:
query = "ranma urusei"
output = search_k_matches(query, corpus, voc, ii_2, idfs)

In [209]:
output

Unnamed: 0,animeTitle,animeDescription,Url,Similarity
4286,It's a Rumic World: 50th Anniversary Weekly★Sh...,"The characters from Ranma 1/2, Urusei Yatsura,...",https://myanimelist.net/anime/6566/Its_a_Rumic...,0.997792
1160,Ranma ½ Super,Super OVA 1: Based on a story from vol. 27 of ...,https://myanimelist.net/anime/1011/Ranma_½_Super,0.910875


## 3. Define a new score!


***Currently not working, cosine similarity doesn't behave well with 0s, will try to think of a solution tomorrow morning***

The idea here is to obtain a query with several parameters, like for example year of release, voice actors, genre and so on. This query can be obtained either directly in the format "query [parameter1=x, parameter2=y...]" or through a form that assembles the same string so that the inner logic of the algorithm doesn't change either way.<br><br>
The function process_query then builds a dictionary for the query by extrapolating the parameters through regular expressions and returns this dictionary to the main function.<br><br>
Now, in order to come up with some score for the matches in this parameters, I thought of using the cosine distance again and treat the parameters as additional dimensions. Meaning that if in the previous search engine (the one with the tfidf) we had for each doc a vector of tfidfs, now we have the same thing but also ones or zeros depending if there is a match in the parameters (i.e.: [3.60997786640952, 4.1247875750000995, 1, 0, 1, 1]).
Then we compute the cosine distance from the query as before (and we treat the query as its own document again).
This way we have a cosine distance that takes into account the additional parameters.<br><br>
It's very rough, and it just allows for a binary definition of the parameters (either they match or they don't). I know that for certain parameters we could probably achieve a more refined scoring, but it seemed like a lot of work.

In [248]:
# Thinking of a query like "sayian race [year=1999 episodes=22]". So first main query and then in squared parentheses
# the parameters.

# processes the query in its single components
def process_query(query):
    query_dict = dict()
    main_query = re.search("^(.+)\[", query)
    anime_voices = re.search("voices=\(([^\)]+)\)", query)
    anime_chars = re.search("characters=\(([^\)]+)\)", query)
    anime_related = re.search("related=\(([^\)]+)\)", query)
    year = re.search("year=([0-9]+)[,\]]", query)
    anime_type = re.search("type=([a-zA-Z]+)[,\]]", query)
    
    if year:
        query_dict["release_year"] = year.groups()[0]
    if anime_type:
        query_dict["anime_type"] = anime_type.groups()[0]
    # transform these fields in lists before putting them in the dictionary
    if anime_voices:
        anime_voices = anime_voices.groups()[0]
        query_dict["anime_voices"] = [x for x in anime_voices.strip().split(",")]
    if anime_chars:
        anime_chars = anime_chars.groups()[0]
        query_dict["anime_characters"] = [x for x in anime_chars.strip().split(",")]
    if anime_related:
        anime_related = anime_related.groups()[0]
        query_dict["anime_related"] = [x for x in anime_related.strip().split(",")]
    
    # preprocesses main query before putting it in the dictionary
    query_dict["main_text_query"] = process_text(main_query.groups()[0])
    #print(query_dict)
    
    return query_dict
    

# we need to treat the main query as it was its own document (I found references online on this).
# This is the same thing i implemented in 2.2.2, I just put it in a function here.
def evaluate_main_query(query, corpus, voc, ii, idfs):
    dict_relevant = {}
    for word in query:
        if word in voc.keys(): # checks if query is in our vocabulary
            term_id = voc[word]
            for doc_info in ii[term_id]:
                if doc_info[0] not in dict_relevant.keys():
                    dict_relevant[doc_info[0]] = []
                dict_relevant[doc_info[0]].append(doc_info[1])
                
    return dict_relevant
    
# Here we append to the vector made of tfidfs the extra values obtained from the parameterized queries
def evaluate_parameters(query_d, vector, df, anime_num):
    relevant_row = df.iloc[anime_num]
    for dict_key in sorted(query_d.keys()):
        if dict_key == "release_year":
            year = datetime.datetime.strptime(relevant_row["releaseDate"], "%Y-%m-%d %H:%M:%S").year
            vector.append(int(str(year) == query_d[dict_key])) # evaluates the boolean to an integer
            
        elif dict_key == "anime_type":
            anime_type = relevant_row["animeType"]
            vector.append(int(anime_type.lower() == query_d[dict_key].lower()))
            
        elif dict_key == "anime_characters":
            chars = relevant_row["animeCharacters"].lower()
            matches = 0
            for char_query in query_d[dict_key]:
                char_query = char_query.lower()
                if char_query in chars:
                    matches += 1
            score = matches / len(query_d[dict_key])
            vector.append(score)
            
        elif dict_key == "anime_related":
            related = relevant_row["animeRelated"].lower()
            matches = 0
            for rel_anime_query in query_d[dict_key]:
                rel_anime_query = rel_anime_query.lower()
                if rel_anime_query in related:
                    matches += 1
            score = matches / len(query_d[dict_key])
            vector.append(score)
            
        elif dict_key == "anime_voices":
            voices = relevant_row["animeVoices"].lower()
            matches = 0
            for voices_query in query_d[dict_key]:
                voices_query = voices_query.lower()
                if voices_query in voices:
                    matches += 1
            score = matches / len(query_d[dict_key])
            vector.append(score)
        
    return vector

# Here we obtain the query via form
def get_query_with_form():
    query_d = dict()
    main_query = input("Enter your query: ")
    year = input("Year it was released: ")
    anime_type = input("Type of anime: ")
    voices = input("Voice actors: ")
    characters = input("Characters: ")
    related = input("Related animes: ")
    
    query_string = (f"{main_query} [year={year}, anime_type={anime_type}, related=({related}),"
                    f"voices=({voices}), characters=({characters})]")
    
    #print(query_string)
    
    return query_string


def search_k_matches_2(corpus, voc, ii, idfs, query=None, k=10):
    df = pd.read_csv("./html_df.csv")
    if not query:
        query = get_query_with_form()
    query_dict = process_query(query)
    # here we process the main query as it was its own document with perfect match parameters.
    main_query = query_dict["main_text_query"]
    len_query = len(main_query)
    query_vector = [(main_query.count(x) / len_query) * idfs[x] for x in main_query if x in idfs.keys()] # we treat the query as a document
    query_vector += [1]*(len(query_dict.keys())-1)
    #print(query_vector)
    
    distances = []
    dict_relevant = evaluate_main_query(main_query, corpus, voc, ii, idfs)
    
    for key in dict_relevant.keys():
        vector = dict_relevant[key]
        vector = evaluate_parameters(query_dict, vector, df, key)
        if len(vector) == len(query_vector): # this assures the conjuctive (and) property of the search engine
            distances.append((-cosine_similarity(query_vector, vector), key)) # negative of cosine_similarity to get max heap
    heapq.heapify(distances)
    n_matches = len(distances)
    final_results = []
    for i in range(min(k, n_matches)):
        el = heapq.heappop(distances)
        final_results.append([el[1], -el[0]]) # make the cosine distance positive again for the output
    
    #print(final_results)
    indices = [x[0] for x in final_results]
    distances = [x[1] for x in final_results]
    cols = ["animeTitle", "animeDescription"]
    partial_df = df.iloc[indices][cols]
    final_df = partial_df.assign(Url=[urls[i] for i in indices],
                                 Similarity=distances)
    return final_df

In [249]:
# example with query provided
query = "ranma urusei [year=2008, type=special, voices=(gianni), related=(gemelli del destino), characters=(ranma)]"
search_k_matches_2(corpus, voc, ii_2, idfs, query=query)

Unnamed: 0,animeTitle,animeDescription,Url,Similarity
4286,It's a Rumic World: 50th Anniversary Weekly★Sh...,"The characters from Ranma 1/2, Urusei Yatsura,...",https://myanimelist.net/anime/6566/Its_a_Rumic...,0.536458
1160,Ranma ½ Super,Super OVA 1: Based on a story from vol. 27 of ...,https://myanimelist.net/anime/1011/Ranma_½_Super,0.357107


In [239]:
# example qith query via form
search_k_matches_2(corpus, voc, ii_2, idfs)

Enter your query: ranma urusei
Year it was released: 2010
Type of anime: special
Voice actors: gianni
Characters: ranma
Related animes: 
[3.60997786640952, 4.1247875750000995, 1, 1, 1, 1]


Unnamed: 0,animeTitle,animeDescription,Url,Similarity
4286,It's a Rumic World: 50th Anniversary Weekly★Sh...,"The characters from Ranma 1/2, Urusei Yatsura,...",https://myanimelist.net/anime/6566/Its_a_Rumic...,0.539927
1160,Ranma ½ Super,Super OVA 1: Based on a story from vol. 27 of ...,https://myanimelist.net/anime/1011/Ranma_½_Super,0.362313


In [242]:
search_k_matches_2(corpus, voc, ii_2, idfs)

Enter your query: ranma urusei
Year it was released: 2010
Type of anime: sdventure
Voice actors: gianni
Characters: sandro
Related animes: 
[3.60997786640952, 4.1247875750000995, 1, 1, 1, 1]
[0.15928001406847617, 0.0, 0, 0.0, 0]
[0.16162316495793932, 0.0, 0, 0.0, 0]
[0.17991495165333934, 0.0, 0, 0.0, 0]
[0.19668099734300482, 0.0, 0, 0.0, 0]
[0.2136123731528463, 0.09705382529411999, 0.0, 0, 0.0, 0]
[0.23473840624843567, 0.0, 0, 0.0, 0]
[0.2499871257575818, 0.0, 0, 0.0, 0]
[0.32863376874781, 0.0, 0, 0.0, 0]
[0.3521076093726535, 0.3521076093726535, 0.0, 0, 0.0, 0]
[0.3683650884091347, 0.0, 0, 0.0, 0]
[0.39436052249737197, 0.0, 0, 0.0, 0]
[0.4579847946656437, 0.0, 0, 0.0, 0]
[0.4579847946656437, 0.0, 0, 0.0, 0]
[1.0563228281179604, 0.0, 0, 0.0, 0]
[0.16529058063709792, 0.0, 0, 0.0, 0]
[0.24647532656085747, 0.0, 0, 0.0, 0]
[0.30809415820107183, 0.0, 0, 0.0, 0]


Unnamed: 0,animeTitle,animeDescription,Url,Similarity
4286,It's a Rumic World: 50th Anniversary Weekly★Sh...,"The characters from Ranma 1/2, Urusei Yatsura,...",https://myanimelist.net/anime/6566/Its_a_Rumic...,0.937347
1160,Ranma ½ Super,Super OVA 1: Based on a story from vol. 27 of ...,https://myanimelist.net/anime/1011/Ranma_½_Super,0.855695


In [224]:
df.iloc[4286]

Unnamed: 0                                                       4286
animeTitle          It's a Rumic World: 50th Anniversary Weekly★Sh...
animeType                                                     Special
animeNumEpisode                                                   1.0
releaseDate                                       2008-07-30 00:00:00
endDate                                           2008-08-11 00:00:00
animeNumMembers                                                  3682
animeScore                                                       6.88
animeUsers                                                     1914.0
animeRank                                                      4286.0
animePopularity                                                  7858
animeDescription    The characters from Ranma 1/2, Urusei Yatsura,...
animeRelated                ['Ranma ½', 'InuYasha', 'Urusei Yatsura']
animeCharacters     ['Inuyasha', 'Higurashi, Kagome', 'Saotome, Ra...
animeVoices         

## 5. Algorithmic question
You consult for a personal trainer who has a *back-to-back sequence* of requests for appointments. A sequence of requests is of the form
    > 30, 40, 25, 50, 30, 20
where each number is the time that the person who makes the appointment wants to spend.
You need to accept some requests, however you need a break between them, so you cannot accept two consecutive requests. For example, `[30, 50, 20]` is an acceptable solution (of duration *100*), but `[30, 40, 50, 20]` is not, because *30* and *40* are two consecutive appointments. Your goal is to provide to the personal trainer a schedule that maximizes the total length of the accepted appointments. For example, in the previous instance, the optimal solution is `[40, 50, 20]`, of total duration *110*.
1. Write an algorithm that computes the acceptable solution with the longest possible duration.
2. Implement a program that given in input an instance in the form given above, gives the optimal solution.


### 1. Write an algorithm that computes the acceptable solution with the longest possible duration.

##### Here we consider that all values in the instance are unique and len(instance) = n

To compute the acceptable solution with the longest possible duration, we have to follow several steps :
1. Compute every possible solution : So for that, list all sublists which represent each possible list of appointments.
2. For each sublist, tell if it is acceptable or not, so if there are two consecutive appointments or not.
3. Compute the total duration of each acceptable solution.
4. Finally, return the solution which correspond to the maximum duration.

```
Input: 
    instance: list of length n

function optimal_solution(instance):
    n=len(instance)
    for i=0 to n: 
        sublists = sublists + [all sublists with i elements]
    
    acceptable_solutions=[all element of sublists which are acceptable]
    
    durations = [duration of each element of acceptable_solutions]
    max_duration = max(durations)
    optimal_solutions = [sublists of instance with total duration == max_durations]
    
    return optimal_solutions, max_duration
```

### 2. Implement a program that given in input an instance in the form given above, gives the optimal solution.

First of all, we create a function ```is_acceptable(solution, instance)``` that says if a solution is acceptable or not, i.e. there is not two consecutive requests of the instance in the solution.

Then, the ```longest_acceptable_duration(instance)``` compute all possible sublists of the instance, test it with the function ```is_acceptable(solution, instance)```, sum every acceptable list (to compute the duration of each acceptable solution) and return the maximum. 

In [142]:
def is_acceptable(solution, instance):
    res=True 
    for x,y in zip(solution[:-1],solution[1:]):
        i= instance.index(x)
        #index1 = np.where(np.array(instance) == x)
        #for i in index[0]:
        #if the next element is y 
        if instance[i+1] == y:
            res = False
    return res

def optimal_solution(instance):
    sublists=[]
    
    for i in range(1, len(instance)+1):
        sublists+=[list(x) for x in combinations(instance, i)]
        
    mask = [is_acceptable(solution, instance) for solution in sublists]
    acceptable_sol = np.array(sublists)[mask]
    
    durations = [sum(L) for L in acceptable_sol]
    max_duration = max(durations)
    
    index_optimal_solutions = np.where(np.array(durations)==max_duration)
    
    return list(np.array(acceptable_sol)[index_optimal_solutions]), max_duration

In [143]:
instance = [30, 40, 25, 50, 30, 20]
optimal_solutions, max_duration = optimal_solution(instance)
print(optimal_solutions, max_duration)

[[40, 50, 20]] 110
