In [1]:
from bs4 import BeautifulSoup

from EUDirective import EUDirective

In [2]:
import langchain
import nltk

In [43]:
class SentenceSplitter(langchain.text_splitter.TextSplitter):
    """Text splitter that emits sentence-based chunks.

    The input is split on newlines, each line is tokenized into sentences with
    the NLTK punkt tokenizer, and consecutive sentences are merged until their
    combined length exceeds ``sentence_limit``. Sentences of ``ignore_length``
    characters or fewer are discarded.
    """

    def __init__(self, max_chunk_size=4000, chunk_overlap=200, sentence_limit=20, ignore_length=3):
        """Configure the splitter.

        Args:
            max_chunk_size: Stored on the instance; not consulted by ``split_text``.
            chunk_overlap: Stored on the instance; not consulted by ``split_text``.
            sentence_limit: Accumulated character count that a chunk must exceed
                before it is emitted.
            ignore_length: Sentences with at most this many characters are skipped.
        """
        self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        self.max_chunk_size = max_chunk_size
        self.chunk_overlap = chunk_overlap
        self.sentence_limit = sentence_limit
        self.ignore_length = ignore_length

        # The base-class initializer is intentionally not called, so this private
        # attribute is set by hand.
        # NOTE(review): presumably the base class reads it when building
        # documents - confirm against the installed langchain version.
        self._add_start_index = False

    @staticmethod
    def _aggregate_strings_to_limit(strings, limit, ignore):
        """Yield space-joined groups of ``strings`` whose total length exceeds ``limit``.

        Strings of ``ignore`` characters or fewer are skipped. The separating
        spaces are not counted towards ``limit``. Any strings still buffered when
        the input is exhausted are emitted as a final, possibly shorter, group
        so that no text is lost.
        """
        pending = []
        pending_length = 0
        for s in strings:
            if len(s) <= ignore:
                continue
            pending.append(s)
            pending_length += len(s)
            if pending_length > limit:
                yield ' '.join(pending)
                pending = []
                pending_length = 0
        # Flush the tail: without this, trailing sentences that never push the
        # running total past the limit would be silently dropped.
        if pending:
            yield ' '.join(pending)

    def split_text(self, string):
        """Split ``string`` into a list of sentence-based chunks.

        Args:
            string: The text to split.

        Returns:
            A list of chunk strings, in document order.
        """
        sentences = (
            sentence
            for line in string.split('\n')
            for sentence in self.tokenizer.tokenize(line)
        )
        return list(
            self._aggregate_strings_to_limit(
                sentences,
                self.sentence_limit,
                self.ignore_length,
            )
        )
    



# Load the directive from a local HTML file via the project's EUDirective class
# (the log output of this cell shows it walking Article 1 through Article 18).
directive = EUDirective('crowdfunding.html')

# Sentences of 5 characters or fewer are skipped; the rest are merged forward
# until the accumulated length exceeds 50 characters.
splitter = SentenceSplitter(sentence_limit = 50, ignore_length=5)

# Chunk every document produced by the directive; `texts` is reused by later cells.
texts = splitter.split_documents(directive.get_documents())

Starting article Article 1. Currently in article: False
Starting article Article 2. Currently in article: True
Starting article Article 3. Currently in article: True
Starting article Article 4. Currently in article: True
Starting article Article 5. Currently in article: True
Starting article Article 6. Currently in article: True
Starting article Article 7. Currently in article: True
Starting article Article 8. Currently in article: True
Starting article Article 9. Currently in article: True
Starting article Article 10. Currently in article: True
Starting article Article 11. Currently in article: True
Starting article Article 12. Currently in article: True
Starting article Article 13. Currently in article: True
Starting article Article 14. Currently in article: True
Starting article Article 15. Currently in article: True
Starting article Article 16. Currently in article: True
Starting article Article 17. Currently in article: True
Starting article Article 18. Currently in article: True


In [44]:
# Inspect the parsed articles; the recorded output is a dict keyed by article name.
directive.articles

{'Article 1': 'Subject matter, scope and exemptions\n1.   This Regulation lays down uniform requirements for the provision of crowdfunding services, for the organisation, authorisation and supervision of crowdfunding service providers, for the operation of crowdfunding platforms as well as for transparency and marketing communications in relation to the provision of crowdfunding services in the Union.\n2.   This Regulation does not apply to:\n(a)\ncrowdfunding services that are provided to project owners that are consumers, as defined in point (a) of Article 3 of Directive 2008/48/EC;\n(b)\nother services related to those defined in point (a) of Article 2(1) and that are provided in accordance with national law;\n(c)\ncrowdfunding offers with a consideration of more than EUR 5 000 000, which are to be calculated over a period of 12 months as the sum of:\n(i)\nthe total consideration of offers of transferable securities and admitted instruments for crowdfunding purposes as defined in po

In [5]:
from langchain.embeddings import FakeEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

In [6]:
# Trial run of indexing and search using FakeEmbeddings in place of a real
# embedding model (presumably to avoid API calls - confirm).
query = """Phrases related to "union law" """

# NOTE(review): FakeEmbeddings presumably returns placeholder vectors, so the
# similarity scores computed below would not be semantically meaningful - confirm.
embeddings = FakeEmbeddings(size=1024)
# Index each chunk's text together with its metadata in a Chroma store.
docsearch = Chroma.from_texts([t.page_content for t in texts], embeddings, metadatas=[t.metadata for t in texts])

docs = docsearch.similarity_search_with_score(query)

In [46]:
# Preview the first five chunks; each is a Document carrying Directive/Date/Article metadata.
texts[0:5]

[Document(page_content='Subject matter, scope and exemptions This Regulation lays down uniform requirements for the provision of crowdfunding services, for the organisation, authorisation and supervision of crowdfunding service providers, for the operation of crowdfunding platforms as well as for transparency and marketing communications in relation to the provision of crowdfunding services in the Union.', metadata={'Directive': 'REGULATION (EU) 2020/1503 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL', 'Date': '7 October 2020', 'Article': 'Article 1'}),
 Document(page_content='This Regulation does not apply to: crowdfunding services that are provided to project owners that are consumers, as defined in point (a) of Article 3 of Directive 2008/48/EC;', metadata={'Directive': 'REGULATION (EU) 2020/1503 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL', 'Date': '7 October 2020', 'Article': 'Article 1'}),
 Document(page_content='other services related to those defined in point (a) of Article 2

In [9]:
# Rebinds `embeddings`, which the FakeEmbeddings cell also assigns; whichever of
# the two cells ran last determines what later cells see.
embeddings = OpenAIEmbeddings()

In [11]:
#vectordb = Chroma.from_documents(documents=texts,
#                                 embedding=embeddings,
#                                 persist_directory='db')
#vectordb.persist()

In [47]:
# Open a Chroma store backed by the local 'db' directory.
# NOTE(review): presumably this reuses the collection written by the
# commented-out Chroma.from_documents cell - confirm the directory is populated.
vectordb = Chroma(embedding_function=embeddings,
                                 persist_directory='db')

In [61]:
# Persist the store opened with persist_directory='db'.
vectordb.persist()

In [48]:
#vectordb.add_documents(texts)

['84291416-0853-11ee-9e04-00155dd05310',
 '84291dee-0853-11ee-9e04-00155dd05310',
 '84291e70-0853-11ee-9e04-00155dd05310',
 '84291eac-0853-11ee-9e04-00155dd05310',
 '84291ede-0853-11ee-9e04-00155dd05310',
 '84291f10-0853-11ee-9e04-00155dd05310',
 '84291f38-0853-11ee-9e04-00155dd05310',
 '84291f60-0853-11ee-9e04-00155dd05310',
 '84291f88-0853-11ee-9e04-00155dd05310',
 '84291fba-0853-11ee-9e04-00155dd05310',
 '84291fd8-0853-11ee-9e04-00155dd05310',
 '84292000-0853-11ee-9e04-00155dd05310',
 '84292028-0853-11ee-9e04-00155dd05310',
 '84292050-0853-11ee-9e04-00155dd05310',
 '84292078-0853-11ee-9e04-00155dd05310',
 '842920a0-0853-11ee-9e04-00155dd05310',
 '842920be-0853-11ee-9e04-00155dd05310',
 '842920e6-0853-11ee-9e04-00155dd05310',
 '8429210e-0853-11ee-9e04-00155dd05310',
 '84292136-0853-11ee-9e04-00155dd05310',
 '8429215e-0853-11ee-9e04-00155dd05310',
 '8429217c-0853-11ee-9e04-00155dd05310',
 '842921a4-0853-11ee-9e04-00155dd05310',
 '842921cc-0853-11ee-9e04-00155dd05310',
 '842921f4-0853-

In [19]:
# Retrieve the 20 chunks most similar to the query and print their text.
# Note: `query` and `docs` are kernel globals that other cells also assign.
query = 'The phrase union financial services legislation'
docs = vectordb.similarity_search(query=query, k=20)

for d in docs:
    print(d.page_content)

In [76]:
# Retrieve the 40 chunks most similar to the query and print their text.
# The resulting `docs` is what the prompt-formatting cell joins into `sentences`
# (going by the execution counts, that cell ran right after this one).
query = 'penalties or other sanctions'
docs = vectordb.similarity_search(query=query, k=40)

for d in docs:
    print(d.page_content)

Administrative penalties and other administrative measures
CHAPTER VII Administrative penalties and other administrative measures
Those administrative penalties and other administrative measures shall apply at least to:
any penalties imposed on the crowdfunding service provider or its managers.
Exercise of supervisory powers and powers to impose penalties
the number and amount of administrative fines and criminal penalties imposed according to or in relation with this Regulation classified by Member States;
The reporting of penalties and administrative measures to ESMA
the application of administrative penalties and other administrative measures and, in particular, any need to further harmonise the administrative penalties provided for infringements of this Regulation;
Member States may provide for additional penalties or measures and for higher levels of administrative fines than those provided for in this Regulation, in respect of both natural and legal persons responsible for the in

In [51]:
# Prompt with two slots: the retrieved sentences and the question to answer from them.
prompt = PromptTemplate(
    input_variables=['sentences', 'query'],
    template="The following sentences are taken from various EU directives: {sentences}. {query} Only answer with exact phrases found in the given sentences",
)

In [56]:
# NOTE(review): the saved output of this cell (2798) is not the string assigned
# to `query` in the search cells, so the output looks stale - confirm or re-run.
query

2798

In [77]:
# Fill the prompt with the retrieved chunks (one per line) and send it to the LLM.
response = llm(
    prompt.format(
        sentences='\n'.join(doc.page_content for doc in docs),
        query="List the references to penalties or other sanctions.",
    )
)

In [78]:
# Show the raw completion text returned by the LLM.
response

' \n\n"Administrative penalties", "other administrative measures", "criminal penalties", "penalties imposed", "penalties or other measures", "administrative fines", "criminal investigations undertaken", "criminal penalties imposed", "penalties and administrative measures".'

In [70]:
# Split the completion into individual quoted phrases.
# NOTE(review): the separator is quote-space-quote, but the response recorded in
# the preceding cell separates phrases with quote-comma-space-quote, which this
# call would not split; the saved output here appears to come from a different
# response - confirm the intended separator.
response.split('" "')

['.\n\n"Third countries,',
 'entities established in third countries,',
 'services provided by them directly or by a third party,',
 'crowdfunding service provider authorised in that other Member State,',
 'controlled by the same natural or legal persons who control a crowdfunding service provider authorised in that other Member State,',
 'crowdfunding service providers that provide crowdfunding services on a cross-border basis,',
 'competent authority of another Member State,',
 'authorities responsible for the oversight of such other activities as provided for in the relevant Union or national law," and "competent authority which submitted the request."']

In [26]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

In [30]:
# Instantiate the OpenAI LLM with default settings; it is invoked as `llm(...)`
# by the cell that produces `response`.
llm = OpenAI()

In [32]:
from langchain.prompts import PromptTemplate

In [40]:
# NOTE(review): `template` is not assigned in any cell visible here (the
# PromptTemplate cell binds `prompt`), so this likely depends on leftover kernel
# state - confirm before relying on Restart & Run All.
template.format_prompt(sentences='\n'.join([d.page_content for d in docs]))

StringPromptValue(text='The following sentences are taken from European Union directives: ‘competent authority’ means one or more competent authorities:\nWhere notified of the matters listed in the second subparagraph, points (a) to (e), the competent authority may require the operator of the DLT market infrastructure to make an application in accordance with Article 8(13), Article 9(13) or Article 10(13), or may require the operator of the DLT market infrastructure to take corrective measures as referred to in paragraph 3 of this Article.\nany material change to the information provided to the competent authority;\nextended for a further period of up to three years;\nthe operator of DLT TSS will not be able to comply, or will not allow its users to comply, with applicable provisions of Union law or provisions of national law falling outside the scope of Union Law.\nany refusals to grant specific permissions or exemptions, any modifications or withdrawals of such specific permissions o