# Overview
This is my attempt to solve the first assignment in the Information Retrieval course offered at Innopolis University.

In [23]:
import argparse
import os
import re
import requests

def wget(url, file_name=None):
    """Download ``url`` and save the response body to a local file.

    Args:
        url: Address to fetch; redirects are followed.
        file_name: Destination path. When ``None``, the name is taken from the
            last path segment of ``url`` (query string and fragment excluded)
            and the file is placed in the current working directory.

    Returns:
        The path the content was written to, or ``None`` when the server
        answered with a status code other than 200.

    Raises:
        NameError: ``file_name`` was not given and no name could be inferred
            from ``url`` (for example, the url ends with ``/``).
        OSError: the destination path already exists.
    """
    # allow redirects
    response = requests.get(url, allow_redirects=True)
    # only proceed if the status code is 200
    if response.status_code != 200:
        print(response.status_code, response.reason, 'for', url)
        print("EXITING...")
        return None

    # if the file name wasn't given, infer it from the url
    if file_name is None:
        # Capture the last path segment. `[^?#]*` (rather than `.*`) stops the
        # match from running into the query string or fragment, so a slash in
        # `?next=a/b` is not mistaken for a path separator.
        url_name_regex = r'^https?[^?#]*/([^/?#]*)'
        match_url = re.search(url_name_regex, url)
        inferred_name = match_url.group(1) if match_url else ''
        # Validate BEFORE joining with the cwd: the joined path is never
        # empty, so a check made afterwards could not detect a missing name.
        if not inferred_name:
            raise NameError(f'File name was neither given as an argument not could be inferred from {url}')
        # the file will be saved in the program's directory
        file_name = os.path.join(os.getcwd(), inferred_name)

    # if two urls had the same file name, the second one would be rejected
    if os.path.exists(file_name):
        raise OSError(f'File {file_name} already exists!!!')

    # write the content of the url to a local file
    with open(file_name, 'wb') as f:
        f.write(response.content)
    print(f'File saved as {file_name}')

    return file_name


# Entry-point guard: the top-level module's __name__ is '__main__'
# (the original compared against 'main', which never matches).
if __name__ == '__main__':
    pass


In [24]:
# let's build a crawler...
import requests 
from urllib.parse import quote
import hashlib
import os

class Document:
    """A web document identified by its URL, with a simple on-disk cache.

    The raw body is kept in ``self.content`` (``None`` until the document has
    been downloaded or loaded). The cache file lives in the current working
    directory and is named ``<md5 hex digest of the url>.txt``.
    """

    no_url_msg = "\nAN URL MUST BE SET BEFORE PROCEEDING WITH A 'DOCUMENT' OBJECT\n"

    def __init__(self, url):
        """Remember ``url``; nothing is fetched or read at construction time."""
        self.url = url
        # Defined up front so the attribute exists before download()/load().
        self.content = None

    def __get_file_name(self):
        """Return the cache path for ``self.url``, or ``None`` if no url is set."""
        if not self.url:
            print(self.no_url_msg)
            return None

        # first extract the hashed name
        file_name = hashlib.md5(self.url.encode('utf-8')).hexdigest()
        # save the file in the current directory
        file_name = os.path.join(os.getcwd(), f'{file_name}.txt')

        return file_name

    def get(self):
        """Make ``self.content`` available, preferring the local cache.

        Tries ``load()`` first; on a miss, downloads the document and persists
        it. The result of ``persist()`` is not checked, so a failed cache
        write does not fail the call.

        Raises:
            FileNotFoundError: the document is neither cached nor downloadable.
        """
        if self.load():
            return
        if not self.download():
            raise FileNotFoundError(self.url)
        self.persist()

    def download(self):
        """Fetch ``self.url`` into ``self.content``.

        Returns:
            ``True`` on an HTTP 200 response, ``False`` when no url is set or
            the status code is anything else.
        """
        if not self.url:
            print(self.no_url_msg)
            return False

        r = requests.get(url=self.url, allow_redirects=True)
        if r.status_code != 200:
            print("the connection was not successful.")
            return False

        self.content = r.content
        return True

    def persist(self):
        """Write ``self.content`` to the cache file.

        Returns:
            ``True`` if the file was written; ``False`` if no url is set,
            there is no content yet, or the write failed.
        """
        file_name = self.__get_file_name()
        if file_name is None:
            return False

        # Check for content BEFORE opening the file: 'wb' creates/truncates it,
        # and an empty cache file would later be accepted by load() as a valid
        # document. getattr() also covers subclasses that skip __init__.
        content = getattr(self, 'content', None)
        if content is None:
            print("there is no content to persist; download or load the document first.")
            return False

        try:
            with open(file_name, 'wb') as f:
                f.write(content)
            return True

        except FileNotFoundError:
            print("the file has not been yet created!!")
            return False

        except Exception as e:
            print(e)
            return False

    def load(self):
        """Read the cache file into ``self.content``.

        Returns:
            ``True`` if the cache file was read; ``False`` if no url is set,
            the file does not exist, or reading failed.
        """
        file_name = self.__get_file_name()
        if file_name is None:
            return False

        try:
            with open(file_name, 'rb') as f:
                # set the file's content to the content field
                self.content = f.read()
            return True

        except FileNotFoundError:
            print("the file has not been yet created!!")
            return False

        except Exception as e:
            print(e)
            return False
        

In [25]:
# Smoke test for Document: fetch a known page, then verify the cached copy.
# Needs network access the first time it runs in a given working directory.
doc = Document('http://sprotasov.ru/data/iu.txt')

# First get(): uses the local cache file if present, otherwise downloads and persists.
doc.get()
assert doc.content, "Document download failed"
# doc.content is bytes, so str() yields its repr (b'...'); the substring check
# still works because the expected phrase is plain ASCII.
assert "Code snippets, demos and labs for the course" in str(doc.content), "Document content error"

# Second get(): the cache file is expected to exist by now, so load() should succeed.
doc.get()
assert doc.load(), "Load should return true for saved document"
print('3')  # progress marker: the load-from-disk assertion passed
assert "Code snippets, demos and labs for the course" in str(doc.content), "Document load from disk error"
print('4')  # progress marker: the content loaded from disk was verified

the file has not been yet created!!
3
4


## Task 2

In [26]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.parse


class HtmlDocument(Document):
    
    def parse(self):
        #TODO extract plain text, images and links from the document
        self.anchors = [("fake link text", "http://fake.url/")]
        self.images = ["http://image.com/fake.jpg"]
        self.text = "fake text and some other text"