# Homework 3 - What is the best anime in the world?

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import os
import pickle
import numpy as np
import time
import re
import datetime as dt 
import csv
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

## 1. Data collection

### 1.1. Get the list of animes

In [2]:
def take_n_urls(n):

    main_url = "https://myanimelist.net/topanime.php"

    # this list will contain all the urls we'll retrieve
    urls = [] 

    # each page shows 50 elements and we can retrieve each page by manipulating the "limit" query
    for limit in range(0, n, 50): 
        content = requests.get(main_url,
                               params={"limit": limit})
        if content.status_code == 404:
            print(f"Page with limit {limit} was not found. Interrumpting and returning pages found")
            break
        soup = bs(content.content, "html.parser")

        # from the content of each page we retrieve the portions that contain the urls
        results = soup.find_all("a", 
                                class_= "hoverinfo_trigger fl-l ml12 mr8")

        # finally, we take the string containing each url by taking the attribute href,
        # and we append them in the urls list
        for result in results:
            url = result["href"]
            if url not in urls:  # check for duplicates
                urls.append(url)

    return urls

In [3]:
if "urls.txt" not in os.listdir():
    urls = take_n_urls(20000)
    # Since the output of this step has to be a txt file, here we write one with each
    # url separated by a newline
    with open("urls.txt", "w") as file:
        file.write("\n".join(urls_str))
else:
    with open("urls.txt", "r", encoding="utf8") as file:
        print("Loading urls...")
        urls = file.read().split("\n")
        print("Done!")

Loading urls...
Done!


In [4]:
# we end up with 19131 urls. 
# I added a check that tells us when we have exceeded the length of the ranking list and returns what has been found
# up until that moment (so to avoid losing any more time with get requests that point to nothing)
# I know in the assignment they said 20000 but I'm fairly sure that's all the entries. 
# This is easy to see if we manually set the limit in the url and check the results. 
# For example: https://myanimelist.net/topanime.php?limit=15000 contains rankings 15001-15050. The first entry is
# Big X Episode 0. If we check our list with urls[15000] (remember that our list is 0-indexed) we obtain the same result.
# This to me seems to point to a correct behavior from the function, but let me know what you think.

print(len(urls)) 
print(urls[15000])
urls_str = list(map(str, urls))

19130
https://myanimelist.net/anime/30839/Big_X_Episode_0


### 1.2. Crawl animes

Attention: the index of article start with 0 and not 1 so all ranks are shifted by 1 position

In [27]:
# Here we create the directory where the html pages will be stored
if "html_pages" not in os.listdir():
    os.mkdir("html_pages")

In [39]:
def save_html_pages(urls):
    if "counter_pages" not in os.listdir():
        start = 0
    else:
        with open("counter_pages", "rb") as counter_file:
            start = pickle.load(counter_file) + 1

    print(f"Starting from anime #{start}")
    n = len(urls)
    for i in range(start, n):
        ranking_page = str(int(np.floor(i/50)))
        if i % 50 == 0 or f"ranking_page_{ranking_page}" not in os.listdir("./html_pages"):
            os.mkdir(f"html_pages/ranking_page_{ranking_page}")
        html_page = requests.get(urls[i])
        sleep_timer = 60
        while html_page.status_code != 200: # if the status_code is not 200, we've exceeded the number of requests and have to wait
            print(f"Exceeded number of requests while retrieving page #{i}.\nWaiting {sleep_timer} seconds")
            html_page.close()
            time.sleep(sleep_timer)
            html_page = requests.get(urls[i])
            sleep_timer += 10
        with open (f"html_pages/ranking_page_{ranking_page}/article_{i}.html", "w", encoding="utf-8") as file:
            file.write(html_page.text)
        with open ("counter_pages", "wb") as counter_file:
            pickle.dump(i, counter_file)


In [98]:
save_html_pages(urls)

Starting from anime #18306
Exceeded number of requests while retrieving page #18512.
Waiting 60 seconds
Exceeded number of requests while retrieving page #18512.
Waiting 70 seconds
Exceeded number of requests while retrieving page #18512.
Waiting 80 seconds
Exceeded number of requests while retrieving page #18512.
Waiting 90 seconds
Exceeded number of requests while retrieving page #18725.
Waiting 60 seconds
Exceeded number of requests while retrieving page #18725.
Waiting 70 seconds
Exceeded number of requests while retrieving page #18725.
Waiting 80 seconds
Exceeded number of requests while retrieving page #18725.
Waiting 90 seconds
Exceeded number of requests while retrieving page #18905.
Waiting 60 seconds
Exceeded number of requests while retrieving page #18905.
Waiting 70 seconds
Exceeded number of requests while retrieving page #18905.
Waiting 80 seconds
Exceeded number of requests while retrieving page #18905.
Waiting 90 seconds
Exceeded number of requests while retrieving page

### 1.3 Parse downloaded pages

At this point, you should have all the html documents about the animes of interest and you can start to extract the animes informations. The list of information we desire for each anime and their format is the following:

1. Anime Name (to save as `animeTitle`): String
2. Anime Type (to save as `animeType`): String
3. Number of episode (to save as `animeNumEpisode`): Integer
4. Release and End Dates of anime (to save as `releaseDate` and `endDate`): Convert both release and end date into datetime format.
5. Number of members (to save as `animeNumMembers`): Integer
6. Score (to save as `animeScore`): Float
7. Users (to save as `animeUsers`): Integer
8. Rank (to save as `animeRank`): Integer
9. Popularity (to save as `animePopularity`): Integer
10. Synopsis (to save as `animeDescription`): String
11. Related Anime (to save as `animeRelated`): Extract all the related animes, but only keep unique values and those that have a hyperlink associated to them. List of strings.
12. Characters (to save as `animeCharacters`): List of strings.
13. Voices (to save as `animeVoices`): List of strings
14. Staff (to save as `animeStaff`): Include the staff name and their responsibility/task in a list of lists.


In [316]:
# Here we create the directory where the tsv files will be stored
if "tsv_files" not in os.listdir():
    os.mkdir("tsv_files")

In [31]:
path_ex ='C:/Users/aurel/OneDrive/Bureau/IMT/3ème année IMT/0_Cours Sapienza/ADM/Homework/Homework 3'


def collect_info(num_article, folder='tsv_files'):
    ranking_page = str(int(np.floor(num_article/50)))
    article=f'{path_ex}/html_pages/ranking_page_{ranking_page}/article_{num_article}.html'
    with open(article, "r", encoding="utf-8") as file:
        art= bs(file.read(), 'html.parser')
    
    #animeTitle
    animeTitle = art.find('h1', {'class':"title-name h1_bold_none"}).string
    #print('animeTitle :',animeTitle)
    
    
    #animeType
    animeType = art.find('span', {'class':"information type"}).string
    #print('animeType :',animeType)
    
    
    #animeNumEpisode and Dates (there is not specific name for those two info)
    #list lines with tag <div class="spaceit_pad">
    lines = art.find_all('div', {'class':"spaceit_pad"})
    for line in lines :
        #for each div tag there is one span, so here we look for the span tag with 'Episodes:' and 'Aired'
        sp= line.find('span', {'class':"dark_text"})
        # to avoid error if there is no span
        if sp is not None :
            #for span 'Episodes' (and the div tag which corresponds)
            if sp.string == 'Episodes:' :
                #extract the content of the right div tag and take the third line which correspond to the number of episodes
                if line.contents[2] != '\n  Unknown\n  ' :
                    animeNumEpisode = int(line.contents[2])
                    #animeNumEpisode = int(re.findall(r'-?\d+\.?\d*', str(line))[0])           #if we want to use regex  
                else :
                    animeNumEpisode = ''
            #for span 'Aired' (and the div tag which corresponds)
            if sp.string == 'Aired:' :
                str_dates = line.contents[2].split('\n  ')[1]
                #if "Status: Finished Airing" (there is a endDate)
                if ('to' in str_dates) and ('?' not in str_dates):
                    #extract the content of the right div tag and take the third line which correspond to the dates (fix the issue of '\n')
                    str_releaseDate, str_endDate = str_dates.split(' to ')
                    #convert into a datetime
                    date_format = "%b %d, %Y"
                    releaseDate, endDate = dt.datetime.strptime(str_releaseDate, date_format), dt.datetime.strptime(str_endDate, date_format)
                else :
                    str_releaseDate = str_dates.split(' to ')[0]
                    #convert into a datetime
                    date_format = "%b %d, %Y"
                    releaseDate = dt.datetime.strptime(str_releaseDate, date_format)
                    endDate=''
    #print('animeNumEpisode :',animeNumEpisode)
    #print('releaseDate :',releaseDate)
    #print('endDate :',endDate)
    
    
    #animeNumMembers
    animeNumMembers = int(art.find('span', {'class':"numbers members"}).contents[1].string.replace(',',''))
    #print('animeNumMembers :',animeNumMembers)
    
    
    #animeScore
    animeScore = float(art.find('div', {'class':"score-label"}).string)
    #print('animeScore :',animeScore)
    
    
    #animeUsers
    animeUsers = int(art.find('span', {'itemprop':"ratingCount"}).string)
    #print('animeUsers :',animeUsers)
    
    
    #animeRank
    animeRank = int(art.find('span', {'class':"numbers ranked"}).contents[1].string.replace('#',''))
    #print('animeRank :',animeRank)
    
    
    #animePopularity
    animePopularity = int(art.find('span', {'class':"numbers popularity"}).contents[1].string.replace('#',''))
    #print('animePopularity :',animePopularity)
    
    
    #animeDescription
    desc = art.find('p', {'itemprop':"description"}).contents
    animeDescription=''
    #remove <br/> Tag and '\n'
    for ele in desc :
        #delete tags with regex 
        ele = re.sub(re.compile('<.*?>'),'', str(ele))
        animeDescription += ele
        animeDescription = animeDescription.replace('\n','')
    #print('animeDescription :',animeDescription.replace('\n',''))
    
    
    #animeRelated
    animeRelated = []
    #store the table which contain related animes
    table = art.find('table', {'class':"anime_detail_related_anime"})
    if table is not None :
        #store all links/anime related with 'a' Tag
        links = table.find_all('a')
        for link in links :
            # check if there is a hyperlink and add it in the list if yes 
            if (link.get('href') is not None) and (link.string is not None):
                animeRelated += [link.string]
        animeRelated=list(set(animeRelated))
    else :
        animeRelated=''
    #print('animeRelated :',animeRelated)

    
    #animeCharacters
    animeCharacters = art.find_all('h3', {'class':"h3_characters_voice_actors"})
    animeCharacters = [char.string for char in animeCharacters]
    #print('animeCharacters :',animeCharacters)
    
    
    #animeVoices
    td_Voices = art.find_all('td', {'class':"va-t ar pl4 pr4"})
    animeVoices = [voice.find('a').string for voice in td_Voices]
    #print('animeVoices :',animeVoices)
    
    
    #animeStaff
    #if there is a staff, the div which correspond to the table Staff is the second one (there are div with {'class':"detail-characters-list clearfix"})
    if len(art.find_all('div', {'class':"detail-characters-list clearfix"}))>1 :
        div_staff = art.find_all('div', {'class':"detail-characters-list clearfix"})[1] 
        td_staff = div_staff.find_all('td', {'class':"borderClass"})
        animeStaff=[]
        for td in td_staff :
            if td.get('width') == None:
                animeStaff.append([td.find('a').string,td.find('small').string])
    #if there is not staff
    else :
        animeStaff = ''
    #print('animeStaff :',animeStaff)
    
    #create a .tsv file with attributes
    with open(f'{folder}/anime_{num_article}', 'wt', encoding="utf8") as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow([animeTitle, animeType, animeNumEpisode, releaseDate, endDate, animeNumMembers, \
                            animeScore, animeUsers, animeRank, animePopularity, animeDescription, animeRelated, \
                            animeCharacters, animeVoices, animeStaff])
    

In [32]:
for i in range(len(urls)):
    collect_info(i)


AttributeError: 'NoneType' object has no attribute 'string'

In [33]:
i

278

## 2. Search Engine

### 2.0. Pre-processing

1. Removing stopwords
2. Removing punctuation
3. Stemming
4. Anything else you think it's needed

In [26]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aurel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aurel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [14]:
# For stemming
porter = nltk.stem.PorterStemmer()
lancaster = nltk.stem.LancasterStemmer()

# For identifying the stop words
eng_stopwords = stopwords.words("english")

In [80]:
sorted(eng_stopwords)


['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [16]:
def process_text(text):
    try:
        tokenized = nltk.word_tokenize(text)
        stemmed = [porter.stem(word) for word in tokenized if ((word.lower() not in eng_stopwords) and (word not in string.punctuation))]
    except TypeError as e:
        print(text)
        raise TypeError
    return stemmed

In [118]:
def merge_tsvs(path, colnames):
    files = sorted(os.listdir(path))
    df = pd.read_csv(path+files[0],
                     names=colnames,
                     sep="\t", engine='python')
    for file_name in files[1:]:
        df2 = pd.read_csv(path+file_name,
                          names=colnames,
                          sep="\t", engine='python')
        df = pd.concat([df, df2], ignore_index=True)
    return df

In [119]:
path = "./tsv_files/"
colnames = ["animeTitle", "animeType", "animeNumEpisode", "releaseDate", "endDate", "animeNumMembers",
            "animeScore", "animeUsers", "animeRank", "animePopularity", "animeDescription", "animeRelated",
            "animeCharacters", "animeVoices", "animeStaff"]
df = merge_tsvs(path, colnames)

# Save our df in csv format
df.to_csv("./html_df.csv", index=False)

In [120]:
# create a copy of our dataframe with an extra column containing the preprocessed synopsis
df_new = df.assign(tokenized_desc=df["animeDescription"].apply(lambda x: process_text(x)))

In [121]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   animeTitle        31 non-null     object 
 1   animeType         31 non-null     object 
 2   animeNumEpisode   31 non-null     int64  
 3   releaseDate       31 non-null     object 
 4   endDate           24 non-null     object 
 5   animeNumMembers   31 non-null     int64  
 6   animeScore        31 non-null     float64
 7   animeUsers        31 non-null     int64  
 8   animeRank         31 non-null     int64  
 9   animePopularity   31 non-null     int64  
 10  animeDescription  31 non-null     object 
 11  animeRelated      30 non-null     object 
 12  animeCharacters   31 non-null     object 
 13  animeVoices       31 non-null     object 
 14  animeStaff        30 non-null     object 
 15  tokenized_desc    31 non-null     object 
dtypes: float64(1), int64(5), object(10)
memory usa

In [125]:
with open("df_with_tokens.p", "wb") as file:
    pickle.dump(df_new, file)

### 2.1. Conjunctive query
#### 2.1.1) Create your index!

Before building the index, 
* Create a file named `vocabulary`, in the format you prefer, that maps each word to an integer (`term_id`).

Then, the first brick of your homework is to create the Inverted Index. It will be a dictionary of this format:

```
{
term_id_1:[document_1, document_2, document_4],
term_id_2:[document_1, document_3, document_5, document_6],
...}
```
where _document\_i_ is the *id* of a document that contains the word.


In [37]:
# First i chose to create the vocabulary file as a DataFrame store as a .csv file 

def create_vocabulary(corpus):
    voc = set()
    for doc in corpus :
        voc = voc.union(set(doc))
    
    df_voc = pd.DataFrame({'term_id': range(len(voc)), 'term': sorted(voc)})
    df_voc.to_csv('vocabulary.csv')
    return df_voc

In [38]:
#example
L=['A','C', 'B', 'A', 'E']
corpus=[L]+[['B','D']]
s=create_vocabulary(corpus)

In [45]:
def inverted_index(corpus,voc):
    #create a inverted_index "empty", i.e. only with term_id of vocabulary
    inverted_index = dict()
    for term_id in voc['term_id'] :
        inverted_index[term_id]=set()
        #inverted_index['term_id_'+str(term_id)]=set()

    for doc, num_doc in zip(corpus,range(len(corpus))) :
        for word in doc :
            term_id = voc[voc.term==word].reset_index()['term_id'][0]
            inverted_index[term_id]=inverted_index[term_id].union(set([num_doc]))
            #inverted_index['term_id_'+str(term_id)]=inverted_index['term_id_'+str(term_id)].union(set(['document_'+str(num_doc)]))
    #print(inverted_index)
    
    for term_id, docs in inverted_index.items() :
        inverted_index[term_id]=list(inverted_index[term_id])
    
    #save the inverted_index as a .npy file 
    np.save('inverted_index.npy', inverted_index)

In [46]:
#test
voc = pd.read_csv('vocabulary.csv')
inverted_index(corpus,voc)

new_dict = np.load('inverted_index.npy', allow_pickle='TRUE')
print(new_dict)

{0: [0], 1: [0, 1], 2: [0], 3: [1], 4: [0]}


In [67]:
#test
#df_synopsis = pd.read_csv("./temporary.csv")
#df_synopsis['tokenized_desc'][0] #['tokenized_desc'][0]

corpus = list(df_new['tokenized_desc'])
voc = create_vocabulary(corpus)

In [72]:
corpus = list(df_new['tokenized_desc'])
voc = create_vocabulary(corpus)

inverted_index(corpus,voc)

new_dict = np.load('inverted_index.npy', allow_pickle='TRUE')
print(new_dict)

{0: [0, 2, 3, 259, 261, 263, 13, 14, 15, 270, 271, 273, 275, 33, 35, 37, 39, 46, 49, 51, 57, 58, 64, 65, 72, 74, 76, 88, 90, 93, 95, 102, 105, 111, 112, 114, 116, 118, 121, 124, 125, 133, 138, 141, 142, 145, 147, 148, 151, 153, 165, 170, 171, 177, 185, 187, 189, 201, 202, 204, 206, 209, 210, 218, 221, 230, 235, 236, 239, 240, 242, 244, 245, 248, 250, 255], 1: [105], 2: [208, 244, 276], 3: [227, 149], 4: [0, 1, 2, 3, 5, 7, 8, 10, 11, 12, 14, 15, 16, 17, 18, 22, 25, 26, 28, 32, 33, 34, 35, 36, 37, 38, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 55, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 68, 70, 71, 72, 73, 74, 75, 79, 81, 83, 85, 89, 90, 91, 93, 94, 95, 96, 98, 101, 102, 103, 105, 106, 107, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 124, 126, 127, 128, 130, 132, 133, 134, 135, 141, 142, 143, 144, 146, 147, 148, 149, 151, 152, 153, 157, 160, 161, 162, 163, 164, 165, 168, 169, 170, 171, 173, 179, 180, 183, 184, 185, 187, 188, 189, 190, 191, 192, 195, 196, 197, 198, 1

In [73]:
voc

Unnamed: 0,term_id,term
0,0,''
1,1,'05
2,2,'ll
3,3,'re
4,4,'s
...,...,...
5757,5757,zoldyck.hunt
5758,5758,â€
5759,5759,â€œa
5760,5760,â€�


## 5. Algorithmic question
You consult for a personal trainer who has a *back-to-back sequence* of requests for appointments. A sequence of requests is of the form
    > 30, 40, 25, 50, 30, 20
where each number is the time that the person who makes the appointment wants to spend.
You need to accept some requests, however you need a break between them, so you cannot accept two consecutive requests. For example, `[30, 50, 20]` is an acceptable solution (of duration *100*), but `[30, 40, 50, 20]` is not, because *30* and *40* are two consecutive appointments. Your goal is to provide to the personal trainer a schedule that maximizes the total length of the accepted appointments. For example, in the previous instance, the optimal solution is `[40, 50, 20]`, of total duration *110*.
1. Write an algorithm that computes the acceptable solution with the longest possible duration.
2. Implement a program that given in input an instance in the form given above, gives the optimal solution.


### 1. Write an algorithm that computes the acceptable solution with the longest possible duration.

##### Here we consider that all values in the instance are unique and len(instance) = n

To compute the acceptable solution with the longest possible duration, we have to follow several steps :
1. Compute every possible solution : So for that, list all sublists which represent each possible list of appointments.
2. For each sublist, tell if it is acceptable or not, so if there are two consecutive appointments or not.
3. Compute the total duration of each acceptable solution.
4. Finally, return the solution which correspond to the maximum duration.

```
Input: 
    instance: list of length n

function optimal_solution(instance):
    n=len(instance)
    for i=0 to n: 
        sublists = sublists + [all sublists with i elements]
    
    acceptable_solutions=[all element of sublists which are acceptable]
    
    durations = [duration of each element of acceptable_solutions]
    max_duration = max(durations)
    optimal_solutions = [sublists of instance with total duration == max_durations]
    
    return optimal_solutions, max_duration
```

### 2. Implement a program that given in input an instance in the form given above, gives the optimal solution.

First of all, we create a function ```is_acceptable(solution, instance)``` that says if a solution is acceptable or not, i.e. there is not two consecutive requests of the instance in the solution.

Then, the ```longest_acceptable_duration(instance)``` compute all possible sublists of the instance, test it with the function ```is_acceptable(solution, instance)```, sum every acceptable list (to compute the duration of each acceptable solution) and return the maximum. 

In [142]:
def is_acceptable(solution, instance):
    res=True 
    for x,y in zip(solution[:-1],solution[1:]):
        i= instance.index(x)
        #index1 = np.where(np.array(instance) == x)
        #for i in index[0]:
        #if the next element is y 
        if instance[i+1] == y:
            res = False
    return res

def optimal_solution(instance):
    sublists=[]
    
    for i in range(1, len(instance)+1):
        sublists+=[list(x) for x in combinations(instance, i)]
        
    mask = [is_acceptable(solution, instance) for solution in sublists]
    acceptable_sol = np.array(sublists)[mask]
    
    durations = [sum(L) for L in acceptable_sol]
    max_duration = max(durations)
    
    index_optimal_solutions = np.where(np.array(durations)==max_duration)
    
    return list(np.array(acceptable_sol)[index_optimal_solutions]), max_duration

In [143]:
instance = [30, 40, 25, 50, 30, 20]
optimal_solutions, max_duration = optimal_solution(instance)
print(optimal_solutions, max_duration)

[[40, 50, 20]] 110
