# Overview
This is my attempt to solve the first assignment of the Information Retrieval course offered at Innopolis University.

# Task 1

In [1]:
import requests 
from urllib.parse import urlparse
import hashlib
import os


class Document:
    """A resource addressed by a URL, cached as a file in the working directory.

    ``get()`` first tries to read a cached copy from disk and only downloads
    (and then caches) the resource when no copy could be read. The raw bytes
    of the resource are exposed as ``self.content``.
    """

    no_url_msg = "\nAN URL MUST BE SET BEFORE PROCEEDING WITH A 'Document' OBJECT\n"

    # Seconds passed to ``requests.get`` as ``timeout``. Without a timeout a
    # server that never answers would block the call indefinitely.
    request_timeout = 30

    def __init__(self, url):
        """Remember the URL; nothing is read or downloaded yet."""
        self.url = url
        # Raw bytes of the resource. Stays None until download() or load()
        # succeeds, so callers can test it without risking an AttributeError.
        self.content = None

    def _extract_file_extension(self, url):
        """Return the extension of the URL's path without the leading dot.

        Falls back to ``"html"`` when the path is empty or has no extension.
        """
        path = urlparse(url).path
        if not path:
            return "html"
        _, ext = os.path.splitext(path)
        return ext[1:] if ext else "html"

    def __get_file_name(self):
        """Return the absolute cache path for ``self.url``.

        The file name is the MD5 hex digest of the URL followed by the
        extension derived from the URL, placed in the current working
        directory. Returns None (after printing a message) when no URL is set.
        """
        if not self.url:
            print(self.no_url_msg)
            return None

        # The digest gives a file-system-safe name that is stable per URL.
        digest = hashlib.md5(self.url.encode('utf-8')).hexdigest()
        file_name = f'{digest}.{self._extract_file_extension(self.url)}'
        return os.path.join(os.getcwd(), file_name)

    def get(self):
        """Make ``self.content`` available, preferring the cached copy.

        Raises:
            FileNotFoundError: when there is no cached copy and the download
                did not succeed.
        """
        if self.load():
            return
        if not self.download():
            raise FileNotFoundError(self.url)
        self.persist()

    def download(self):
        """Fetch the URL over HTTP and store the body in ``self.content``.

        Returns:
            True on an HTTP 200 response, False when no URL is set or another
            status code was received. Exceptions raised by ``requests``
            (including timeouts) are not caught here.
        """
        if not self.url:
            print(self.no_url_msg)
            return False

        r = requests.get(url=self.url, allow_redirects=True, timeout=self.request_timeout)
        if r.status_code != 200:
            print(f"The status code recieved is {r.status_code}.Please make sure the url: {self.url}\n is valid and you have the needed permissions\n")
            return False

        self.content = r.content
        return True

    def persist(self):
        """Write ``self.content`` to the cache file.

        Returns:
            True when the file was written, False otherwise (no URL, no
            content, or an error while writing).
        """
        file_name = self.__get_file_name()
        if file_name is None:
            return False

        # Opening with 'wb' creates the file immediately; bail out first so a
        # missing payload cannot leave an empty file that load() would later
        # accept as a valid cached copy.
        if self.content is None:
            print("There is no content to persist: download or load the document first")
            return False

        try:
            with open(file_name, 'wb') as f:
                f.write(self.content)
            return True

        except FileNotFoundError:
            print("the file has not been yet created!!")
            return False

        except Exception as e:
            print(e)
            return False

    def load(self):
        """Read the cached copy into ``self.content``.

        Returns:
            True when the cache file was read, False when no URL is set, the
            file does not exist, or reading failed.
        """
        file_name = self.__get_file_name()
        if file_name is None:
            return False

        try:
            with open(file_name, 'rb') as f:
                self.content = f.read()
            return True

        except FileNotFoundError:
            # A missing cache file is the normal "not downloaded yet" case.
            return False

        except Exception as e:
            print(e)
            return False
        

In [2]:
# Task 1 check: fetch a small text file twice and verify its content both times.
expected_snippet = "Code snippets, demos and labs for the course"
doc = Document('http://sprotasov.ru/data/iu.txt')

# After the first get() the content must be available.
doc.get()
assert doc.content, "Document download failed"
assert expected_snippet in str(doc.content), "Document content error"

# After a second get() the document must also be readable from disk via load().
doc.get()
assert doc.load(), "Load should return true for saved document"
assert expected_snippet in str(doc.content), "Document load from disk error"


# Task 2

In [3]:
from bs4 import BeautifulSoup as bs
from bs4.element import Comment
import urllib.parse
from urllib.parse import urljoin
import re
import html5lib

class HtmlDocument(Document):
    """A ``Document`` whose content is markup to be parsed with BeautifulSoup.

    After ``parse()`` the instance exposes:

    * ``anchors`` - list of ``(link text, absolute URL)`` tuples, one per
      ``<a>`` tag that has an ``href`` attribute;
    * ``images``  - list of absolute URLs, one per ``<img>`` tag that has a
      ``src`` attribute;
    * ``text``    - the visible text of the page as a single string.
    """

    # A text node whose parent is one of these is not visible page text.
    _hidden_parents = ('style', 'script', 'title', 'head', 'meta', '[document]')

    def __init__(self, url, text_join_str=' '):
        """Store the URL and the separator used to join the text pieces.

        Args:
            url: address of the page.
            text_join_str: string placed between the pieces of visible text
                scraped from the page.
        """
        super().__init__(url)
        self.text_join_str = text_join_str
        # Defined up front so the attributes exist (empty) even when parse()
        # returns early or has not been called yet.
        self.anchors = []
        self.images = []
        self.text = ''

    def _is_html(self):
        """Return True when html5lib parses ``self.content`` without raising.

        NOTE(review): ``html5lib.HTMLParser()`` is lenient unless constructed
        with ``strict=True``, so this presumably returns True for almost any
        input - confirm whether the 'lxml' fallback is ever meant to trigger.
        """
        try:
            html5lib.HTMLParser().parse(self.content)
            return True
        except Exception as e:
            print(e)
            return False

    @staticmethod
    def _is_visible(element):
        """Tell whether a text node is visible page text (not a comment, not
        inside one of the ``_hidden_parents`` tags)."""
        return (element.parent.name not in HtmlDocument._hidden_parents
                and not isinstance(element, Comment))

    def parse(self):
        """Extract anchors, images and visible text from ``self.content``.

        Prints a message and returns without parsing when no content has
        been loaded or downloaded yet.
        """
        # getattr keeps the guard working even if the base class never
        # defined the attribute.
        content = getattr(self, 'content', None)
        if content is None:
            print("Please make sure to have the 'content' field set")
            return

        parser_name = 'html.parser' if self._is_html() else 'lxml'
        doc_soup = bs(content, parser_name)

        # href=True / src=True skip tags that carry no target; urljoin turns
        # relative references into absolute URLs based on the page URL.
        self.anchors = [(str(link.text), urljoin(self.url, link['href']))
                        for link in doc_soup.find_all('a', href=True)]
        self.images = [urljoin(self.url, img['src'])
                       for img in doc_soup.find_all('img', src=True)]

        # Collect every text node, keep the visible non-blank ones, join them
        # and collapse each run of whitespace (including whitespace inside
        # the join string) into a single space.
        pieces = [node.strip()
                  for node in doc_soup.find_all(string=True)
                  if self._is_visible(node) and node.strip()]
        self.text = re.sub(r"\s+", ' ', self.text_join_str.join(pieces))
        
                

In [4]:
# Task 2 check: fetch and parse a page, then verify the extracted data.
doc = HtmlDocument("http://sprotasov.ru")
doc.get()
doc.parse()

# Uncomment to inspect what was extracted:
# print(doc.anchors)
# print(doc.images)
# print(doc.text)

assert "http://sprotasov.ru/images/gb.svg" in doc.images, "Error parsing images"
assert any(href == "https://twitter.com/07C3" for _, href in doc.anchors), "Error parsing links"
assert "just few links" in doc.text, "Error parsing text"

# Task 3

In [5]:
! pip install nltk 
! pip install spacy
! pip install langdetect
! pip install langcodes
! pip install language_data



In [6]:
# Download the NLTK data packages used by the Task 3 code:
# 'stopwords' backs the stopwords.words(...) lookup; 'punkt' is presumably the
# model data behind sent_tokenize / word_tokenize - confirm against the NLTK docs.
import nltk
nltk.download('punkt') 
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bouab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bouab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# Lower-case English names of the languages the text statistics support;
# a detected language that is not listed here is treated as unsupported.
used_languages = [
    "german",
    "spanish",
    "english",
    "russian",
    "chinese",
    "arabic",
]

In [8]:
# let's use some simple NLP tools here
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize 
from langdetect import detect
from collections import Counter
from langcodes import Language

from nltk.corpus import stopwords
import re

# Matches any token that contains at least one of the listed punctuation
# characters anywhere in it; get_word_stats() drops every token this matches.
# NOTE(review): '-', '/' and typographic symbols such as '•' are not part of
# the character class, so tokens made of them are kept as "words".
punc_regex = r'.*[!\"#\$%&\'\(\)\*\+,\.:;<=>\?@\[\\\]\^_`{\|}~«»]+.*'

class HtmlDocumentTextData:
    """Language-aware text statistics for one HTML page.

    The page is fetched and parsed on construction; the parsed document is
    available as ``self.doc``.
    """

    def __init__(self, url):
        """Fetch (or load from cache) and parse the page at ``url``.

        Whatever ``HtmlDocument.get()`` raises propagates to the caller.
        """
        self.doc = HtmlDocument(url)
        self.doc.get()
        self.doc.parse()

    def _detect_language(self):
        """Detect the language of the page text.

        Returns:
            A ``(language_name, stop_words)`` tuple, where ``language_name``
            is one of ``used_languages`` and ``stop_words`` is the set of
            NLTK stop words for it; or None when the language cannot be
            detected or is not in ``used_languages``.
        """
        # Local import: keeps this class independent of an additional
        # file-level import.
        from langdetect.lang_detect_exception import LangDetectException

        try:
            lan_code = detect(self.doc.text)
        except LangDetectException:
            # Raised for text without usable features (for example an empty
            # page); treat it like an unsupported language instead of
            # aborting the caller.
            return None

        # detect() yields a language code; langcodes turns it into the
        # English display name that used_languages and NLTK are keyed by.
        language = Language.make(language=lan_code).display_name().lower()
        if language not in used_languages:
            return None
        return language, set(stopwords.words(language))

    def get_sentences(self):
        """Split the page text into sentences.

        Uses the tokenizer of the detected language and falls back to the
        English one when the language is unknown or unsupported.
        """
        detected = self._detect_language()
        # _detect_language() may return None; unpacking it blindly would
        # raise a TypeError.
        lang = detected[0] if detected is not None else 'english'
        return sent_tokenize(self.doc.text, language=lang)

    def get_word_stats(self):
        """Count the words of the page text.

        Tokens are lower-cased and stripped; stop words and tokens matching
        ``punc_regex`` are dropped.

        Returns:
            A ``Counter`` mapping each remaining word to its frequency; empty
            when the language is unsupported or tokenization fails.
        """
        detected = self._detect_language()
        words = []

        if detected is not None:
            lang, stop_words = detected
            try:
                # Normalise each token once, then filter.
                normalized = (token.lower().strip()
                              for token in word_tokenize(self.doc.text, language=lang))
                words = [word for word in normalized
                         if word not in stop_words
                         and re.match(punc_regex, word) is None]
            except Exception as e:
                # Best effort: a tokenizer failure yields empty statistics,
                # but it is reported instead of being silently ignored.
                print(f"Could not tokenize the text of {self.doc.url}: {e}")
                words = []

        return Counter(words)

In [9]:
# Task 3 check: the university's name must be among the ten most frequent words.
doc = HtmlDocumentTextData("https://innopolis.university/")

print(doc.get_word_stats().most_common(10))
assert any(word == 'иннополис' for word, _ in doc.get_word_stats().most_common(10)), 'иннополис should be among most common'

[('иннополис', 20), ('университет', 11), ('университета', 11), ('центр', 10), ('образование', 8), ('робототехники', 6), ('деятельность', 6), ('управления', 5), ('образовательной', 5), ('2022', 5)]


# Task 4

In [10]:
from requests.exceptions import RequestException 


class Crawler:
    """Depth-limited recursive crawler that follows the anchors of each page."""

    def crawl_generator(self, source, depth=1):
        """Yield ``(page, url)`` for ``source`` and the pages reachable from it.

        Args:
            source: URL to start from.
            depth: number of levels to visit; values below 1 are treated as 1
                (only ``source`` itself).

        Yields:
            ``(HtmlDocumentTextData, url)`` tuples, the start page first and
            then, depth-first, the pages its anchors point to.

        A page that cannot be fetched or processed is skipped, so a single
        bad link does not stop the crawl.

        NOTE(review): visited URLs are not tracked, so a page linked several
        times is yielded several times.
        """
        depth = max(1, depth)

        try:
            top_doc = HtmlDocumentTextData(source)
            yield (top_doc, top_doc.doc.url)

            if depth > 1:
                for _, link_url in top_doc.doc.anchors:
                    yield from self.crawl_generator(link_url, depth=depth - 1)

        except RequestException as reqe:
            print(f"the url: {source} is not valid")
            print(reqe)
        except FileNotFoundError:
            # Raised by Document.get() when the download fails; download()
            # has already printed the reason.
            pass
        except Exception as e:
            # Deliberately not a bare ``except``: KeyboardInterrupt and
            # GeneratorExit must propagate so the crawl can be interrupted
            # and the generator can be closed properly.
            print(f"skipping {source}: {e!r}")


In [11]:
# Task 4 check: crawl two levels deep and accumulate the word counts of every page.
crawler = Crawler()
counter = Counter()
# Links ending with one of these suffixes are reported and skipped.
skipped_suffixes = ('.pdf', '.mp3', '.avi', '.mp4', '.txt')

for page, _ in crawler.crawl_generator("https://innopolis.university/en/", 2):
    page_url = page.doc.url
    print(page_url)
    if page_url.endswith(skipped_suffixes):
        print("Skipping", page_url)
        continue
    counter.update(page.get_word_stats())
    print(len(counter), "distinct word(s) so far")

print("Done")

print(counter.most_common(20))
assert any(word == 'innopolis' for word, _ in counter.most_common(20)), 'innopolis sould be among most common'

https://innopolis.university/en/
283 distinct word(s) so far
https://innopolis.university/
795 distinct word(s) so far
https://innopolis.university/en/
795 distinct word(s) so far
https://innopolis.university/en/
795 distinct word(s) so far
https://innopolis.university/en/
795 distinct word(s) so far
https://innopolis.university/en/
795 distinct word(s) so far
https://innopolis.university/en/
795 distinct word(s) so far
https://innopolis.university/en/
795 distinct word(s) so far
https://innopolis.university/en/
795 distinct word(s) so far
https://apply.innopolis.university/en
1393 distinct word(s) so far
The status code recieved is 404.Please make sure the url: https://innopolis.university/proekty/activity/en"
 is valid and you have the needed permissions

https://media.innopolis.university/en
1457 distinct word(s) so far
https://innopolis.university/lk/
1467 distinct word(s) so far
https://innopolis.university/en/about/
1643 distinct word(s) so far
https://innopolis.university/en/boa

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://innopolis.university/public/files/Consent_to_the_processing_of_PD_for_UI.pdf
Skipping https://innopolis.university/public/files/Consent_to_the_processing_of_PD_for_UI.pdf
https://t.me/universityinnopolis
5952 distinct word(s) so far
https://vk.com/innopolisu
6108 distinct word(s) so far
https://www.youtube.com/user/InnopolisU
6121 distinct word(s) so far
https://apply.innopolis.ru/en/
7663 distinct word(s) so far
https://innopolis.university/en/proekty/activity/
7663 distinct word(s) so far
https://innopolis.university/en/about/
7663 distinct word(s) so far
https://career.innopolis.university/en/
7663 distinct word(s) so far
https://innopolis.university/en/
7663 distinct word(s) so far
https://innopolis.university/en/
7663 distinct word(s) so far
https://innopolis.university/en/
7663 distinct word(s) so far
the url: mailto:319@innopolis.ru is not valid
No connection adapters were found for 'mailto:319@innopolis.ru'
https://panoroo.com/virtual-tours/NvQZM6B2
7663 distinct word(s

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://innopolis.university/public/files/Consent_to_the_processing_of_PD_for_UI.pdf
Skipping https://innopolis.university/public/files/Consent_to_the_processing_of_PD_for_UI.pdf
Done
[('university', 1949), ('innopolis', 1057), ('education', 796), ('research', 773), ('international', 585), ('students', 574), ('development', 464), ('•', 444), ('russia', 423), ('science', 401), ('-', 398), ('russian', 384), ('faculty', 375), ('program', 373), ('robotics', 369), ('иннополис', 329), ('student', 327), ('information', 301), ('teaching', 283), ('activities', 272)]
