# Overview
This is my attempt to solve the first assigment in the Information Retrieval course offered in Innopolis University

# Task1

In [4]:
import requests 
from urllib.parse import quote
import hashlib
import os

class Document:
    no_url_msg = "\nAN URL MUST BE SET BEFORE PROCEEDING WITH A 'Document' OBJECT\n"


    def __init__(self, url):
        self.url = url

    def __get_file_name(self):
        if not self.url:
            print(self.no_url_msg)
            return None
        
        # first extract the hashed name
        file_name = hashlib.md5(self.url.encode('utf-8')).hexdigest()
        # save the file in the current directory
        file_name = os.path.join(os.getcwd(), f'{file_name}.txt')

        return file_name

    def get(self):
        if not self.load():
            if not self.download():
                raise FileNotFoundError(self.url)
            else:
                self.persist()


    def download(self):
        if not self.url:
            print(self.no_url_msg)
            return False

        r =  requests.get(url=self.url, allow_redirects=True)
        if r.status_code != 200:
            print(f"the connection to the site {self.url} \nwas not successful.")
            return False
        
        self.content = r.content
        return True
    

    def persist(self):
        file_name = self.__get_file_name()
        if file_name is None:
            return False

        try:
            with open(file_name, 'wb') as f: 
                f.write(self.content)
            return True

        except FileNotFoundError as ffe:
            print("the file has not been yet created!!")
            return False

        except Exception as e :
            print(e)
            return False


    def load(self):
        file_name = self.__get_file_name()
        if file_name is None:
            return False

        try:
            with open(file_name, 'rb') as f: 
                # set the file's content to the content field
                self.content = f.read()
            return True
        
        except FileNotFoundError as ffe:
            # print("the file has not been yet created!!")
            return False

        except Exception as e :
            print(e)
            return False
        

In [5]:
doc = Document('http://sprotasov.ru/data/iu.txt')

doc.get()
assert doc.content, "Document download failed"
assert "Code snippets, demos and labs for the course" in str(doc.content), "Document content error"

doc.get()
assert doc.load(), "Load should return true for saved document"
assert "Code snippets, demos and labs for the course" in str(doc.content), "Document load from disk error"


# Task 2

In [6]:
from bs4 import BeautifulSoup as bs
from bs4.element import Comment
import urllib.parse
from urllib.parse import urljoin
import re

class HtmlDocument(Document):
    def __init__(self, url, text_join_str=' '):
        super().__init__(url)
        # this character is used to join all the pieces of visible text scapped from the site 
        self.text_join_str = text_join_str

    def parse(self):
        # this function assumes the self.content is already set
        if self.content is None:
            print("Please make sure to have the 'content' field set")        
            return
        # create the soup object to parse the html document
        doc_soup = bs(self.content, 'html.parser')
        # extract anchors
        self.anchors = [(str(link.string).strip() if link.string is not None else '', 
        urljoin(self.url, link['href'])) for link in doc_soup.find_all('a', href=True)] # filter those anchors with no actual link associated with them
        
        # extract images
        self.images = [urljoin(self.url, img['src']) for img in doc_soup.find_all('img', src=True)] # filter the anchors with no actual image associated with them.
        # extract text 
        # firstly extract all text
        raw_text = doc_soup.findAll(string=True)
        # secondly filter the text associated with unwanted tags
        def plain_text(element):
            return  element.parent.name not in ['style', 'script', 'title', 'head', 'meta', '[document]'] and not isinstance(element, Comment) 

        filtered_text = filter(plain_text, raw_text) # remove unwanted text
        # join all the text into a single text by the text_join_char attribute
        self.text = re.sub(r"\s+", ' ',self.text_join_str.join([t.strip() for t in filtered_text if t.strip()]))
        
                

In [7]:
doc = HtmlDocument("http://sprotasov.ru")
doc.get()
doc.parse()

print(doc.anchors)
print(doc.images)
print(doc.text)


assert "http://sprotasov.ru/images/gb.svg" in doc.images, "Error parsing images"
assert any(p[1] == "https://twitter.com/07C3" for p in doc.anchors), "Error parsing links"
assert "just few links" in doc.text, "Error parsing text"

[('telegram', 'https://t.me/sprotasov'), ('email', 'mailto:stanislav.protasov@gmail.com'), ('Curriculum vitae', 'https://docs.google.com/document/d/e/2PACX-1vQqlsxmlbkwp7CypdNg5vcl9zEfE1w6EFppJ2iBbHpZrpOI0AIzFkeu21-Or1_PYlnq1ICyLR1qaNlu/pub'), ('Google Scholar', 'https://scholar.google.ru/citations?user=pDske8oAAAAJ'), ('GitHub', 'https://github.com/str-anger'), ('Track record in Quantum', 'http://sprotasov.ru/q.html'), ('ResearchGate', 'https://www.researchgate.net/profile/Stanislav-Protasov'), ('Публикации в eLibrary', 'http://elibrary.ru/author_items.asp?authorid=789317'), ('Facebook', 'https://www.facebook.com/stanislav.protasov'), ('LinkedIn', 'https://www.linkedin.com/pub/stanislav-protasov/28/651/b38'), ('Research with Stas telegram channel', 'https://t.me/iu_aml'), ('', 'https://t.me/origin_of_species'), ('iTunes', 'https://itunes.apple.com/ru/podcast/происхождение-видов/id1282666034'), ('RSS', 'http://sprotasov.ru/podcast/rss.xml'), ('Automatic testing system', 'http://code-te

# Task 3

In [8]:
! pip install nltk 
! pip install spacy
! pip install langdetect
! pip install langcodes
! pip install language_data



In [9]:
# download the data needed for nltk
import nltk
nltk.download('punkt') 
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bouab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bouab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# let's use some simple NLP tools here
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize 
from langdetect import detect
from collections import Counter
from langcodes import Language

from nltk.corpus import stopwords
from string import punctuation 

import re

punc_regex = r'.*[!\"#\$%&\'\(\)\*\+,-\.:;<=>\?@\[\\\]\^_`{\|}~«»]+.*'

class HtmlDocumentTextData:
    
    def __init__(self, url):        
        self.doc = HtmlDocument(url)
        self.doc.get()
        self.doc.parse()
    
    def _detect_language(self):
        # to use the power of NLP tools, regardless of the text's language, language detection is needed
        text = self.doc.text
        # determine the language used in the text
        lan_code = detect(text)
        # the language variable represents the code of the language and not its standard form
        # the langcodes package is in for the rescue
        language = Language.make(language=lan_code).display_name().lower()
        return language, set(stopwords.words(language))        

    def get_sentences(self):
        text = self.doc.text
        lang, _ = self._detect_language()
        # the main limitation of this class is that it assumes the text's language is english
        result = sent_tokenize(text, language=lang)
        return result
    
    def get_word_stats(self):
        text = self.doc.text
        lang, stop_words = self._detect_language()
        words = [w.lower().strip() for w in word_tokenize(text, language=lang) if w.lower().strip() not in stop_words and re.match(punc_regex, w.lower().strip()) is None]
        # create the counter and map each word to its frequency
        counter = Counter()
        for w in words:
            counter[w] += 1 
        return counter

In [11]:
doc = HtmlDocumentTextData("https://innopolis.university/")

print(doc.get_word_stats().most_common(10))
assert [x for x in doc.get_word_stats().most_common(10) if x[0] == 'иннополис'], 'иннополис should be among most common'

[('иннополис', 22), ('университет', 12), ('университета', 12), ('центр', 10), ('образование', 8), ('робототехники', 6), ('деятельность', 6), ('управления', 5), ('научные', 5), ('образовательной', 5)]


# task 4

In [17]:
from queue import Queue

class Crawler:
    def crawler_generator(self, source, depth=1):
        if depth <= 1:
            html_doc = HtmlDocumentTextData(source)
            results =  [html_doc] if html_doc.doc.content else []
        else: 
            top_doc = HtmlDocumentTextData(source)
            results = [top_doc] if top_doc.doc.content else []
            try:
                for link in top_doc.doc.anchors:
                    results.extend(self.crawler_generator(link[1], depth=depth-1))
            except ValueError:
                # the connection to the site was not successful
                pass
             
        return results  

In [19]:
url = "http://sprotasov.ru"

craw = Crawler()

res = craw.crawler_generator(url, depth=2)

print(res)

[<__main__.HtmlDocumentTextData object at 0x000001C47AAC3D60>, <__main__.HtmlDocumentTextData object at 0x000001C41FA47E80>]
