In [None]:
!pip install chromadb
!pip install sentence_transformers
!pip install langchain

## Libraries

In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

In [None]:
import urllib3
from urllib.parse import urljoin
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Tokenizer and seq2seq model are loaded from the same checkpoint and wrapped
# in a text2text-generation pipeline.
PT_EN_CHECKPOINT = "unicamp-dl/translation-pt-en-t5"

tokenizer = AutoTokenizer.from_pretrained(PT_EN_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(PT_EN_CHECKPOINT)
pten_pipeline = pipeline(
    'text2text-generation',
    model=model,
    tokenizer=tokenizer,
)

## Web Crawling

In [None]:
def crawling(url_page):
    """Collect the absolute URLs of every hyperlink found on a page.

    Downloads ``url_page``, parses it and resolves the ``href`` of each
    ``<a>`` tag against ``url_page``. Fragments (``#...``) are stripped and
    URLs containing a single quote are skipped. Duplicates are kept and the
    links are returned in document order.

    Args:
        url_page: Address of the page to crawl.

    Returns:
        list[str]: The collected URLs, or an empty list when the page could
        not be downloaded.
    """
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    http = urllib3.PoolManager()

    try:
        page_data = http.request('GET', url_page)
    except urllib3.exceptions.HTTPError:
        # Without this early return the code below would run with
        # `page_data` unbound and fail with an unrelated NameError.
        print("Error: " + url_page)
        return []

    # NOTE(review): the page is parsed with the "xml" feature while
    # extract_information uses 'html.parser' -- confirm this is intended.
    soup = BeautifulSoup(page_data.data, "xml")

    resultados = []
    for link in soup.find_all('a'):
        if "href" not in link.attrs:
            continue

        url = urljoin(url_page, str(link.get('href')))

        # Skip URLs containing a single quote.
        if "'" in url:
            continue

        # Drop the fragment so only the page address is kept.
        resultados.append(url.split("#")[0])

    print("Number of links: " + str(len(resultados)))

    return resultados

## Selecting links

In [None]:
# Crawl the seed article, then keep only the slice [500:520] of the links
# found (at most 20 pages) for the rest of the notebook.
all_links = crawling("https://en.wikipedia.org/wiki/Programming_language")
links = all_links[500:520]
links

## Extracting text

In [None]:
# Output file that collects the text of every crawled page; mode "w" truncates
# any previous content. The handle stays open across the following cells and
# is released by the later `arq.close()` cell.
# NOTE(review): no encoding is given, so the platform default is used --
# confirm it matches the encoding used when the file is read back.
arq = open("pages_content.txt", "w")

In [None]:
def get_text(html):
    """Return the text of a parsed page as one space-separated string.

    ``<script>`` and ``<style>`` elements are removed from ``html`` first, so
    the document passed in is modified in place.

    Args:
        html: Parsed document. It must be callable with a list of tag names
            (returning the matching elements) and expose ``stripped_strings``,
            as a BeautifulSoup object does.

    Returns:
        str: The remaining stripped strings joined by single spaces.
    """
    for tag in html(['script', 'style']):
        # decompose() has to be *called*; a bare `tag.decompose` reference is
        # a no-op and leaves script/style content in the extracted text.
        tag.decompose()

    return ' '.join(html.stripped_strings)

In [None]:
def extract_information(url):
    """Download a page and append its text to the shared output file.

    The page text is obtained with ``get_text``, re-wrapped so that each line
    holds at most 100 words, and written to the module-level file handle
    ``arq`` followed by a newline.

    Args:
        url: Address of the page to download.

    Raises:
        Any exception raised by ``urlopen`` for an unreachable or invalid URL
        propagates to the caller.
    """
    words_per_line = 100

    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, 'html.parser')
    text = get_text(soup)

    words = text.split()
    # Words are separated by exactly one space and lines by exactly one
    # newline (no doubled spaces, no space after a line break).
    lines = [
        ' '.join(words[start:start + words_per_line])
        for start in range(0, len(words), words_per_line)
    ]
    text_with_line_breaks = '\n'.join(lines)

    arq.write(text_with_line_breaks)
    arq.write("\n")

In [None]:
# Download every selected page and append its text to the output file.
for link in links:
    extract_information(link)

In [None]:
# Close the output file so everything written so far is flushed to disk
# before the file is read back.
arq.close()

## Document loaders

In [None]:
# Read the scraped text back from disk through LangChain's TextLoader.
loader = TextLoader('pages_content.txt')
documents = loader.load()

## Document transformers

In [None]:
# Splitter configuration: split on literal newlines (not a regex), target
# chunks of 200 units as measured by len(), with an overlap of 20.
# NOTE(review): each line of the input file holds up to 100 words, which is
# presumably longer than 200 characters -- confirm the chunk size is intended.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

In [None]:
# Apply the splitter to the loaded documents and show how many chunks resulted.
texts = text_splitter.split_documents(documents)
len(texts)

## Text embedding models

In [None]:
# Embedding wrapper backed by the "all-MiniLM-L6-v2" sentence-transformers model.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

## Vector stores

In [None]:
# Build a Chroma vector store from the text chunks using the embedding model above.
db = Chroma.from_documents(texts, embeddings)

In [None]:
# Inspect the underlying collection, asking for the stored embedding vectors.
# NOTE(review): `_collection` is a private attribute, and with no filter this
# presumably returns every stored entry -- consider showing only a sample.
db._collection.get(include=['embeddings'])

## Retrievers

In [None]:
# Expose the vector store as a retriever; search_kwargs k=1 requests a single
# result per query.
retriever = db.as_retriever(search_kwargs={"k": 1})

In [None]:
# Display the retriever object to check its configuration.
retriever

## Question 1

In [None]:
# Question 1: fetch the stored chunk(s) the retriever ranks closest to the
# query and display them.
docs = retriever.get_relevant_documents("Who created Python programming language??")
docs

## Question 2

In [None]:
# Question 2: fetch the stored chunk(s) the retriever ranks closest to the
# query and display them.
docs = retriever.get_relevant_documents("Is C a object oriented language?")
docs


## Question 3

In [None]:
# Question 3: fetch the stored chunk(s) the retriever ranks closest to the
# query and display them.
docs = retriever.get_relevant_documents("Is JavaScript used in web programming?")
docs


## Question 4

In [None]:
# Question 4: fetch the stored chunk(s) the retriever ranks closest to the
# query and display them.
docs = retriever.get_relevant_documents("What are the programming paradigms?")
docs